churnkit 0.75.1a3__py3-none-any.whl → 0.76.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a2.dist-info}/METADATA +5 -2
  2. {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a2.dist-info}/RECORD +41 -40
  3. customer_retention/__init__.py +11 -1
  4. customer_retention/core/compat/__init__.py +3 -0
  5. customer_retention/core/config/__init__.py +43 -8
  6. customer_retention/core/config/experiments.py +20 -0
  7. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +222 -149
  8. customer_retention/integrations/adapters/factory.py +8 -5
  9. customer_retention/integrations/adapters/feature_store/base.py +1 -0
  10. customer_retention/integrations/adapters/feature_store/databricks.py +58 -10
  11. customer_retention/integrations/adapters/mlflow/base.py +8 -0
  12. customer_retention/integrations/adapters/mlflow/databricks.py +15 -2
  13. customer_retention/integrations/adapters/mlflow/local.py +7 -0
  14. customer_retention/integrations/databricks_init.py +153 -0
  15. customer_retention/stages/profiling/temporal_feature_analyzer.py +3 -3
  16. customer_retention/stages/profiling/temporal_feature_engineer.py +2 -2
  17. customer_retention/stages/profiling/temporal_pattern_analyzer.py +4 -4
  18. customer_retention/stages/profiling/time_series_profiler.py +5 -5
  19. customer_retention/stages/profiling/time_window_aggregator.py +3 -3
  20. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +0 -0
  21. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +0 -0
  22. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +0 -0
  23. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +0 -0
  24. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +0 -0
  25. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +0 -0
  26. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +0 -0
  27. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +0 -0
  28. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +0 -0
  29. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +0 -0
  30. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +0 -0
  31. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +0 -0
  32. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +0 -0
  33. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +0 -0
  34. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +0 -0
  35. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +0 -0
  36. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +0 -0
  37. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +0 -0
  38. {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +0 -0
  39. {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a2.dist-info}/WHEEL +0 -0
  40. {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a2.dist-info}/entry_points.txt +0 -0
  41. {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a2.dist-info}/licenses/LICENSE +0 -0
customer_retention/integrations/adapters/feature_store/databricks.py

@@ -8,6 +8,24 @@ from ..base import AdapterResult
 from .base import FeatureStoreAdapter, FeatureViewConfig
 
 
+def _import_feature_engineering_client() -> Any:
+    try:
+        from databricks.feature_engineering import FeatureEngineeringClient
+
+        return FeatureEngineeringClient
+    except ImportError:
+        from databricks.feature_store import FeatureStoreClient
+
+        return FeatureStoreClient
+
+
+def _validate_write_mode(mode: str) -> None:
+    if mode == "overwrite":
+        raise ValueError(
+            "FeatureEngineeringClient.write_table only supports mode='merge'. Use mode='merge' instead of 'overwrite'."
+        )
+
+
 class DatabricksFeatureStore(FeatureStoreAdapter):
     def __init__(self, catalog: str = "main", schema: str = "default"):
         if not is_spark_available():
@@ -19,27 +37,46 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
     @property
     def fe_client(self) -> Any:
         if self._fe_client is None:
-            from databricks.feature_engineering import FeatureEngineeringClient
-            self._fe_client = FeatureEngineeringClient()
+            client_cls = _import_feature_engineering_client()
+            self._fe_client = client_cls()
         return self._fe_client
 
     def _full_name(self, name: str) -> str:
         return f"{self.catalog}.{self.schema}.{name}"
 
-    def create_table(self, name: str, schema: Dict[str, str], primary_keys: List[str]) -> AdapterResult:
+    def create_table(
+        self, name: str, schema: Dict[str, str], primary_keys: List[str], timeseries_column: Optional[str] = None
+    ) -> AdapterResult:
         full_name = self._full_name(name)
         spark = get_spark_session()
         df = spark.createDataFrame([], self._schema_to_spark(schema))
-        self.fe_client.create_table(name=full_name, primary_keys=primary_keys, df=df)
+        kwargs: Dict[str, Any] = {"name": full_name, "primary_keys": primary_keys, "df": df}
+        if timeseries_column:
+            kwargs["timeseries_columns"] = [timeseries_column]
+        self.fe_client.create_table(**kwargs)
         return AdapterResult(success=True, metadata={"name": full_name})
 
     def _schema_to_spark(self, schema: Dict[str, str]) -> Any:
-        from pyspark.sql.types import FloatType, IntegerType, StringType, StructField, StructType
-        type_map = {"int": IntegerType(), "float": FloatType(), "string": StringType()}
+        from pyspark.sql.types import (
+            FloatType,
+            IntegerType,
+            StringType,
+            StructField,
+            StructType,
+            TimestampType,
+        )
+
+        type_map = {
+            "int": IntegerType(),
+            "float": FloatType(),
+            "string": StringType(),
+            "timestamp": TimestampType(),
+        }
         fields = [StructField(name, type_map.get(dtype, StringType()), True) for name, dtype in schema.items()]
         return StructType(fields)
 
     def write_table(self, name: str, df: pd.DataFrame, mode: str = "merge") -> AdapterResult:
+        _validate_write_mode(mode)
         full_name = self._full_name(name)
         spark = get_spark_session()
         spark_df = spark.createDataFrame(df)
@@ -72,14 +109,22 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
         table_name = self._full_name(config.name)
         spark = get_spark_session()
         spark_df = spark.createDataFrame(df)
-        self.fe_client.create_table(name=table_name, primary_keys=[config.entity_key], df=spark_df)
+        kwargs: Dict[str, Any] = {"name": table_name, "primary_keys": [config.entity_key], "df": spark_df}
+        if hasattr(config, "timeseries_column") and config.timeseries_column:
+            kwargs["timeseries_columns"] = [config.timeseries_column]
+        self.fe_client.create_table(**kwargs)
         return table_name
 
     def get_historical_features(self, entity_df: pd.DataFrame, feature_refs: List[str]) -> pd.DataFrame:
         from databricks.feature_engineering import FeatureLookup
+
         spark = get_spark_session()
-        lookups = [FeatureLookup(table_name=ref.split(":")[0], lookup_key=[entity_df.columns[0]]) for ref in feature_refs]
-        training_set = self.fe_client.create_training_set(df=spark.createDataFrame(entity_df), feature_lookups=lookups, label=None)
+        lookups = [
+            FeatureLookup(table_name=ref.split(":")[0], lookup_key=[entity_df.columns[0]]) for ref in feature_refs
+        ]
+        training_set = self.fe_client.create_training_set(
+            df=spark.createDataFrame(entity_df), feature_lookups=lookups, label=None
+        )
         return training_set.load_df().toPandas()
 
     def materialize(self, feature_views: List[str], start_date: str, end_date: str) -> None:
@@ -89,6 +134,9 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
         entity_df = pd.DataFrame(entity_keys)
         spark = get_spark_session()
         from databricks.feature_engineering import FeatureLookup
-        lookups = [FeatureLookup(table_name=ref.split(":")[0], lookup_key=list(entity_keys.keys())) for ref in feature_refs]
+
+        lookups = [
+            FeatureLookup(table_name=ref.split(":")[0], lookup_key=list(entity_keys.keys())) for ref in feature_refs
+        ]
         result = self.fe_client.score_batch(df=spark.createDataFrame(entity_df), feature_lookups=lookups)
         return result.toPandas().to_dict()
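
For orientation, a minimal usage sketch of the updated feature store adapter follows. The catalog, schema, table name, and columns are illustrative rather than taken from the package, and it assumes a Databricks runtime where either databricks.feature_engineering or the legacy databricks.feature_store is importable.

# Hypothetical usage of the new create_table/write_table behavior (names are illustrative).
import pandas as pd

from customer_retention.integrations.adapters.feature_store.databricks import DatabricksFeatureStore

store = DatabricksFeatureStore(catalog="main", schema="churn")

# timeseries_column is the new optional argument; it is forwarded as timeseries_columns=[...].
store.create_table(
    name="customer_events",
    schema={"customer_id": "string", "event_ts": "timestamp", "spend_30d": "float"},
    primary_keys=["customer_id"],
    timeseries_column="event_ts",
)

# write_table now raises ValueError for mode="overwrite"; only "merge" is accepted.
events = pd.DataFrame({"customer_id": ["c-001"], "event_ts": [pd.Timestamp("2024-01-01")], "spend_30d": [12.5]})
store.write_table("customer_events", events, mode="merge")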
customer_retention/integrations/adapters/mlflow/base.py

@@ -30,3 +30,11 @@ class MLflowAdapter(ABC):
     @abstractmethod
     def transition_stage(self, model_name: str, version: str, stage: str) -> None:
         pass
+
+    @abstractmethod
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        pass
+
+    @abstractmethod
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        pass
customer_retention/integrations/adapters/mlflow/databricks.py

@@ -7,9 +7,12 @@ from .base import MLflowAdapter
 try:
     import mlflow
     from mlflow.tracking import MlflowClient
+
     MLFLOW_AVAILABLE = True
+    MLFLOW_MAJOR_VERSION = int(mlflow.__version__.split(".")[0])
 except ImportError:
     MLFLOW_AVAILABLE = False
+    MLFLOW_MAJOR_VERSION = 0
 
 
 class DatabricksMLflow(MLflowAdapter):
@@ -18,7 +21,8 @@ class DatabricksMLflow(MLflowAdapter):
             raise ImportError("PySpark required for DatabricksMLflow")
         if not MLFLOW_AVAILABLE:
             raise ImportError("mlflow package required")
-        mlflow.set_registry_uri(registry_uri)
+        if MLFLOW_MAJOR_VERSION < 3:
+            mlflow.set_registry_uri(registry_uri)
         self.registry_uri = registry_uri
         self._client = MlflowClient()
         self._run_id = None
@@ -44,7 +48,10 @@ class DatabricksMLflow(MLflowAdapter):
         mlflow.log_metrics(metrics)
 
     def log_model(self, model: Any, artifact_path: str, registered_name: Optional[str] = None) -> str:
-        info = mlflow.sklearn.log_model(model, artifact_path, registered_model_name=registered_name)
+        if MLFLOW_MAJOR_VERSION >= 3:
+            info = mlflow.sklearn.log_model(model, name=artifact_path, registered_model_name=registered_name)
+        else:
+            info = mlflow.sklearn.log_model(model, artifact_path, registered_model_name=registered_name)
         return info.model_uri
 
     def load_model(self, model_uri: str) -> Any:
@@ -52,3 +59,9 @@
 
     def transition_stage(self, model_name: str, version: str, stage: str) -> None:
         self._client.set_model_version_tag(name=model_name, version=version, key="stage", value=stage)
+
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        self._client.set_registered_model_alias(name=model_name, alias=alias, version=version)
+
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        return self._client.get_model_version_by_alias(name=model_name, alias=alias)
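
A sketch of how the new alias methods might be used to promote a model version. The model name, version, and alias are illustrative, and the constructor call assumes DatabricksMLflow can be built with its defaults (its full __init__ signature is not shown in this diff). Since transition_stage on Databricks now only records the stage as a version tag, aliases appear to be the preferred promotion path.

# Hypothetical promotion flow (model name, version, and alias are illustrative).
adapter = DatabricksMLflow()

# Point the "champion" alias at a specific registered model version.
adapter.set_alias("churn_model", alias="champion", version="3")

# Resolve the alias back to the underlying model version metadata.
champion = adapter.get_model_by_alias("churn_model", alias="champion")
print(champion.version)

# Loading by alias URI should also work where MLflow supports models:/<name>@<alias>.
model = adapter.load_model("models:/churn_model@champion")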
customer_retention/integrations/adapters/mlflow/local.py

@@ -5,6 +5,7 @@ from .base import MLflowAdapter
 try:
     import mlflow
     from mlflow.tracking import MlflowClient
+
     MLFLOW_AVAILABLE = True
 except ImportError:
     MLFLOW_AVAILABLE = False
@@ -48,3 +49,9 @@ class LocalMLflow(MLflowAdapter):
 
     def transition_stage(self, model_name: str, version: str, stage: str) -> None:
         self._client.transition_model_version_stage(name=model_name, version=version, stage=stage)
+
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        self._client.set_registered_model_alias(name=model_name, alias=alias, version=version)
+
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        return self._client.get_model_version_by_alias(name=model_name, alias=alias)
customer_retention/integrations/databricks_init.py

@@ -0,0 +1,153 @@
+from __future__ import annotations
+
+import os
+import shutil
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+
+@dataclass
+class DatabricksInitResult:
+    catalog: str
+    schema: str
+    experiment_name: str
+    workspace_path: str | None
+    model_name: str
+    notebooks_copied: list[str] = field(default_factory=list)
+
+    @property
+    def environment_variables(self) -> dict[str, str]:
+        env_vars = {
+            "CR_CATALOG": self.catalog,
+            "CR_SCHEMA": self.schema,
+            "CR_EXPERIMENT_NAME": self.experiment_name,
+            "CR_EXPERIMENTS_DIR": f"/Workspace/{self.workspace_path}/experiments" if self.workspace_path else "",
+        }
+        if self.workspace_path:
+            env_vars["CR_WORKSPACE_PATH"] = self.workspace_path
+        return env_vars
+
+
+def databricks_init(
+    catalog: str = "main",
+    schema: str = "default",
+    experiment_name: str | None = None,
+    workspace_path: str | None = None,
+    copy_notebooks: bool = True,
+    model_name: str = "customer_retention",
+) -> DatabricksInitResult:
+    _validate_databricks_environment()
+    _set_environment_variables(catalog, schema, workspace_path)
+    resolved_experiment_name = experiment_name or _resolve_experiment_name_from_notebook_path()
+    resolved_experiment_name = _make_absolute_experiment_path(resolved_experiment_name, workspace_path)
+    _set_experiment_name_env_var(resolved_experiment_name)
+    _configure_mlflow_experiment(resolved_experiment_name)
+    notebooks_copied: list[str] = []
+    if copy_notebooks and workspace_path:
+        notebooks_copied = _copy_exploration_notebooks(workspace_path)
+    result = DatabricksInitResult(
+        catalog=catalog,
+        schema=schema,
+        experiment_name=resolved_experiment_name,
+        workspace_path=workspace_path,
+        model_name=model_name,
+        notebooks_copied=notebooks_copied,
+    )
+    _display_init_summary(result)
+    return result
+
+
+def _validate_databricks_environment() -> None:
+    if not os.environ.get("DATABRICKS_RUNTIME_VERSION"):
+        raise RuntimeError(
+            "databricks_init() must be called from a Databricks notebook. "
+            "DATABRICKS_RUNTIME_VERSION not found in environment."
+        )
+
+
+def _set_environment_variables(catalog: str, schema: str, workspace_path: str | None) -> None:
+    os.environ["CR_CATALOG"] = catalog
+    os.environ["CR_SCHEMA"] = schema
+    if workspace_path:
+        os.environ["CR_WORKSPACE_PATH"] = workspace_path
+        os.environ["CR_EXPERIMENTS_DIR"] = f"/Workspace/{workspace_path}/experiments"
+
+
+def _set_experiment_name_env_var(experiment_name: str) -> None:
+    os.environ["CR_EXPERIMENT_NAME"] = experiment_name
+
+
+def _resolve_experiment_name_from_notebook_path() -> str:
+    try:
+        dbutils = _get_dbutils()
+        if dbutils:
+            notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
+            return notebook_path.rsplit("/", 1)[-1]
+    except Exception:
+        pass
+    return "customer_retention"
+
+
+def _get_dbutils() -> Any | None:
+    try:
+        from customer_retention.core.compat.detection import get_dbutils
+
+        return get_dbutils()
+    except Exception:
+        return None
+
+
+def _make_absolute_experiment_path(experiment_name: str, workspace_path: str | None) -> str:
+    if experiment_name.startswith("/"):
+        return experiment_name
+    if not workspace_path:
+        return experiment_name
+    base = workspace_path.removeprefix("/Workspace")
+    if not base.startswith("/"):
+        base = f"/{base}"
+    return f"{base}/{experiment_name}"
+
+
+def _configure_mlflow_experiment(experiment_name: str) -> None:
+    try:
+        import mlflow
+
+        mlflow.set_experiment(experiment_name)
+    except ImportError:
+        pass
+
+
+def _copy_exploration_notebooks(workspace_path: str) -> list[str]:
+    from customer_retention.generators.notebook_generator.project_init import ProjectInitializer
+
+    source_dir = ProjectInitializer(project_name="")._get_exploration_source_dir()
+    if not source_dir or not source_dir.exists():
+        return []
+
+    dest_dir = Path(f"/Workspace/{workspace_path}/exploration_notebooks")
+    dest_dir.mkdir(parents=True, exist_ok=True)
+
+    copied = []
+    for notebook in source_dir.glob("*.ipynb"):
+        dest_path = dest_dir / notebook.name
+        if not dest_path.exists():
+            shutil.copy2(notebook, dest_path)
+            copied.append(str(dest_path))
+
+    return copied
+
+
+def _display_init_summary(result: DatabricksInitResult) -> None:
+    print("ChurnKit Databricks Initialization Complete")
+    print("=" * 45)
+    print(f"  Catalog: {result.catalog}")
+    print(f"  Schema: {result.schema}")
+    print(f"  Experiment: {result.experiment_name}")
+    print(f"  Workspace Path: {result.workspace_path or '(not set)'}")
+    print(f"  Model Name: {result.model_name}")
+    if result.notebooks_copied:
+        print(f"  Notebooks Copied: {len(result.notebooks_copied)}")
+        for nb in result.notebooks_copied:
+            print(f"    - {nb}")
+    print("=" * 45)
customer_retention/stages/profiling/temporal_feature_analyzer.py

@@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 from scipy import stats
 
-from customer_retention.core.compat import DataFrame, ensure_datetime_column, pd, qcut, to_pandas
+from customer_retention.core.compat import DataFrame, ensure_datetime_column, native_pd, pd, qcut, to_pandas
 from customer_retention.core.utils import compute_effect_size
 
 
@@ -642,7 +642,7 @@ class TemporalFeatureAnalyzer:
         )
 
     def _calculate_iv(self, feature: pd.Series, target: pd.Series, bins: int = 10) -> float:
-        df_iv = pd.DataFrame({"feature": feature, "target": target}).dropna()
+        df_iv = native_pd.DataFrame({"feature": feature, "target": target}).dropna()
         if len(df_iv) < bins * 2:
             return 0.0
         try:
@@ -666,7 +666,7 @@ class TemporalFeatureAnalyzer:
         return float(grouped["iv"].sum())
 
     def _calculate_ks(self, feature: pd.Series, target: pd.Series) -> Tuple[float, float]:
-        df_ks = pd.DataFrame({"feature": feature, "target": target}).dropna()
+        df_ks = native_pd.DataFrame({"feature": feature, "target": target}).dropna()
         group0, group1 = df_ks[df_ks["target"] == 0]["feature"], df_ks[df_ks["target"] == 1]["feature"]
         if len(group0) == 0 or len(group1) == 0:
             return 0.0, 1.0
customer_retention/stages/profiling/temporal_feature_engineer.py

@@ -25,7 +25,7 @@ from typing import Any, Dict, List, Optional
 
 import numpy as np
 
-from customer_retention.core.compat import Timedelta, pd, to_datetime, to_pandas
+from customer_retention.core.compat import Timedelta, native_pd, pd, to_datetime, to_pandas
 
 
 class ReferenceMode(Enum):
@@ -307,7 +307,7 @@ class TemporalFeatureEngineer:
 
         if self.config.reference_mode == ReferenceMode.GLOBAL_DATE:
             ref_date = self.config.global_reference_date or datetime.now()
-            return pd.DataFrame({
+            return native_pd.DataFrame({
                 entity_col: entities,
                 "reference_date": ref_date,
             })
customer_retention/stages/profiling/temporal_pattern_analyzer.py

@@ -10,7 +10,7 @@ from customer_retention.core.compat import (
     Timestamp,
     cut,
     ensure_datetime_column,
-    pd,
+    native_pd,
     safe_to_datetime,
     to_pandas,
 )
@@ -316,7 +316,7 @@ def _diagnose_anomaly_pattern(
     entity_first = df.groupby(entity_column)[time_column].min()
     entity_last = df.groupby(entity_column)[time_column].max()
     tenure = (entity_last - entity_first).dt.days
-    tenure_by_target = pd.DataFrame({"target": entity_target, "tenure": tenure})
+    tenure_by_target = native_pd.DataFrame({"target": entity_target, "tenure": tenure})
     retained_tenure = tenure_by_target[tenure_by_target["target"] == 1]["tenure"]
     churned_tenure = tenure_by_target[tenure_by_target["target"] == 0]["tenure"]
     retained_median_tenure = float(retained_tenure.median()) if len(retained_tenure) > 0 else None
@@ -597,7 +597,7 @@ class TemporalPatternAnalyzer:
 
     def analyze_cohorts(self, df: DataFrame, entity_column: str, cohort_column: str, target_column: Optional[str] = None, period: str = "M") -> DataFrame:
         if len(df) == 0:
-            return pd.DataFrame()
+            return native_pd.DataFrame()
 
         df_copy = to_pandas(df).copy()
         ensure_datetime_column(df_copy, cohort_column)
@@ -638,7 +638,7 @@ class TemporalPatternAnalyzer:
         target_correlation = None
         if target_column and target_column in df.columns:
             entity_target = df.groupby(entity_column)[target_column].first()
-            combined = pd.DataFrame({"recency": recency_days, "target": entity_target}).dropna()
+            combined = native_pd.DataFrame({"recency": recency_days, "target": entity_target}).dropna()
 
             if len(combined) > 2:
                 corr, _ = stats.pearsonr(combined["recency"], combined["target"])
customer_retention/stages/profiling/time_series_profiler.py

@@ -7,7 +7,7 @@ from customer_retention.core.compat import (
     DataFrame,
     Timestamp,
     ensure_datetime_column,
-    pd,
+    native_pd,
     to_pandas,
 )
 
@@ -95,7 +95,7 @@ def classify_lifecycle_quadrants(entity_lifecycles: DataFrame) -> LifecycleQuadr
         lifecycles=lc,
         tenure_threshold=tenure_threshold,
         intensity_threshold=intensity_threshold,
-        recommendations=pd.DataFrame(rows),
+        recommendations=native_pd.DataFrame(rows),
     )
 
 
@@ -160,7 +160,7 @@ def classify_activity_segments(entity_lifecycles: DataFrame) -> ActivitySegmentR
         lifecycles=lc,
         q25_threshold=q25,
         q75_threshold=q75,
-        recommendations=pd.DataFrame(rows),
+        recommendations=native_pd.DataFrame(rows),
     )
 
 
@@ -237,7 +237,7 @@ class TimeSeriesProfiler:
     def _compute_entity_lifecycles(self, df: DataFrame) -> DataFrame:
         grouped = df.groupby(self.entity_column)[self.time_column]
 
-        lifecycles = pd.DataFrame({
+        lifecycles = native_pd.DataFrame({
             "entity": grouped.first().index.tolist(),
             "first_event": grouped.min().values,
             "last_event": grouped.max().values,
@@ -302,7 +302,7 @@ class TimeSeriesProfiler:
             events_per_entity=DistributionStats(
                 min=0, max=0, mean=0, median=0, std=0, q25=0, q75=0
            ),
-            entity_lifecycles=pd.DataFrame(columns=[
+            entity_lifecycles=native_pd.DataFrame(columns=[
                "entity", "first_event", "last_event", "duration_days", "event_count"
            ]),
             avg_inter_event_days=None,
customer_retention/stages/profiling/time_window_aggregator.py

@@ -12,7 +12,7 @@ from customer_retention.core.compat import (
     Timestamp,
     ensure_datetime_column,
     is_numeric_dtype,
-    pd,
+    native_pd,
     to_pandas,
 )
 
@@ -85,7 +85,7 @@ class TimeWindowAggregator:
     ) -> DataFrame:
         df = to_pandas(df)
         if len(df) == 0:
-            return pd.DataFrame()
+            return native_pd.DataFrame()
 
         df = df.copy()
         ensure_datetime_column(df, self.time_column)
@@ -113,7 +113,7 @@ class TimeWindowAggregator:
         if include_tenure:
             result_data["days_since_first_event"] = self._compute_tenure(df, entities, reference_date)
 
-        result = pd.DataFrame(result_data)
+        result = native_pd.DataFrame(result_data)
         result.attrs["aggregation_reference_date"] = (
             reference_date.isoformat() if hasattr(reference_date, "isoformat") else str(reference_date))
         result.attrs["aggregation_timestamp"] = Timestamp.now().isoformat()
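
Across the profiling modules above, DataFrame construction moves from the compat alias pd to native_pd, while pd is kept for annotations such as pd.Series. The sketch below mirrors that pattern; the compat module's internals are not part of this diff, so the comment about what pd resolves to is an assumption.

# Assumption: the compat `pd` alias may resolve to a pandas-compatible backend,
# while `native_pd` is plain pandas, so frames built inside the profilers stay native pandas.
from customer_retention.core.compat import native_pd, pd


def tenure_frame(entity_target: "pd.Series", tenure: "pd.Series"):
    # Mirrors the hunks above: construct with native_pd, annotate with pd.
    return native_pd.DataFrame({"target": entity_target, "tenure": tenure})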