churnkit 0.75.1a3__py3-none-any.whl → 0.76.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a1.dist-info}/METADATA +5 -2
  2. {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a1.dist-info}/RECORD +41 -40
  3. customer_retention/__init__.py +11 -1
  4. customer_retention/core/compat/__init__.py +3 -0
  5. customer_retention/core/config/__init__.py +43 -8
  6. customer_retention/core/config/experiments.py +20 -0
  7. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +222 -149
  8. customer_retention/integrations/adapters/factory.py +8 -5
  9. customer_retention/integrations/adapters/feature_store/base.py +1 -0
  10. customer_retention/integrations/adapters/feature_store/databricks.py +58 -10
  11. customer_retention/integrations/adapters/mlflow/base.py +8 -0
  12. customer_retention/integrations/adapters/mlflow/databricks.py +15 -2
  13. customer_retention/integrations/adapters/mlflow/local.py +7 -0
  14. customer_retention/integrations/databricks_init.py +141 -0
  15. customer_retention/stages/profiling/temporal_feature_analyzer.py +3 -3
  16. customer_retention/stages/profiling/temporal_feature_engineer.py +2 -2
  17. customer_retention/stages/profiling/temporal_pattern_analyzer.py +4 -3
  18. customer_retention/stages/profiling/time_series_profiler.py +5 -4
  19. customer_retention/stages/profiling/time_window_aggregator.py +3 -2
  20. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +0 -0
  21. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +0 -0
  22. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +0 -0
  23. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +0 -0
  24. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +0 -0
  25. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +0 -0
  26. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +0 -0
  27. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +0 -0
  28. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +0 -0
  29. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +0 -0
  30. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +0 -0
  31. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +0 -0
  32. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +0 -0
  33. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +0 -0
  34. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +0 -0
  35. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +0 -0
  36. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +0 -0
  37. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +0 -0
  38. {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +0 -0
  39. {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a1.dist-info}/WHEEL +0 -0
  40. {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a1.dist-info}/entry_points.txt +0 -0
  41. {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a1.dist-info}/licenses/LICENSE +0 -0
customer_retention/integrations/adapters/feature_store/databricks.py

@@ -8,6 +8,24 @@ from ..base import AdapterResult
 from .base import FeatureStoreAdapter, FeatureViewConfig
 
 
+def _import_feature_engineering_client() -> Any:
+    try:
+        from databricks.feature_engineering import FeatureEngineeringClient
+
+        return FeatureEngineeringClient
+    except ImportError:
+        from databricks.feature_store import FeatureStoreClient
+
+        return FeatureStoreClient
+
+
+def _validate_write_mode(mode: str) -> None:
+    if mode == "overwrite":
+        raise ValueError(
+            "FeatureEngineeringClient.write_table only supports mode='merge'. Use mode='merge' instead of 'overwrite'."
+        )
+
+
 class DatabricksFeatureStore(FeatureStoreAdapter):
     def __init__(self, catalog: str = "main", schema: str = "default"):
         if not is_spark_available():
@@ -19,27 +37,46 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
     @property
     def fe_client(self) -> Any:
         if self._fe_client is None:
-            from databricks.feature_engineering import FeatureEngineeringClient
-            self._fe_client = FeatureEngineeringClient()
+            client_cls = _import_feature_engineering_client()
+            self._fe_client = client_cls()
         return self._fe_client
 
     def _full_name(self, name: str) -> str:
         return f"{self.catalog}.{self.schema}.{name}"
 
-    def create_table(self, name: str, schema: Dict[str, str], primary_keys: List[str]) -> AdapterResult:
+    def create_table(
+        self, name: str, schema: Dict[str, str], primary_keys: List[str], timeseries_column: Optional[str] = None
+    ) -> AdapterResult:
         full_name = self._full_name(name)
         spark = get_spark_session()
         df = spark.createDataFrame([], self._schema_to_spark(schema))
-        self.fe_client.create_table(name=full_name, primary_keys=primary_keys, df=df)
+        kwargs: Dict[str, Any] = {"name": full_name, "primary_keys": primary_keys, "df": df}
+        if timeseries_column:
+            kwargs["timeseries_columns"] = [timeseries_column]
+        self.fe_client.create_table(**kwargs)
         return AdapterResult(success=True, metadata={"name": full_name})
 
     def _schema_to_spark(self, schema: Dict[str, str]) -> Any:
-        from pyspark.sql.types import FloatType, IntegerType, StringType, StructField, StructType
-        type_map = {"int": IntegerType(), "float": FloatType(), "string": StringType()}
+        from pyspark.sql.types import (
+            FloatType,
+            IntegerType,
+            StringType,
+            StructField,
+            StructType,
+            TimestampType,
+        )
+
+        type_map = {
+            "int": IntegerType(),
+            "float": FloatType(),
+            "string": StringType(),
+            "timestamp": TimestampType(),
+        }
         fields = [StructField(name, type_map.get(dtype, StringType()), True) for name, dtype in schema.items()]
         return StructType(fields)
 
     def write_table(self, name: str, df: pd.DataFrame, mode: str = "merge") -> AdapterResult:
+        _validate_write_mode(mode)
         full_name = self._full_name(name)
         spark = get_spark_session()
         spark_df = spark.createDataFrame(df)
@@ -72,14 +109,22 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
         table_name = self._full_name(config.name)
         spark = get_spark_session()
         spark_df = spark.createDataFrame(df)
-        self.fe_client.create_table(name=table_name, primary_keys=[config.entity_key], df=spark_df)
+        kwargs: Dict[str, Any] = {"name": table_name, "primary_keys": [config.entity_key], "df": spark_df}
+        if hasattr(config, "timeseries_column") and config.timeseries_column:
+            kwargs["timeseries_columns"] = [config.timeseries_column]
+        self.fe_client.create_table(**kwargs)
         return table_name
 
     def get_historical_features(self, entity_df: pd.DataFrame, feature_refs: List[str]) -> pd.DataFrame:
         from databricks.feature_engineering import FeatureLookup
+
         spark = get_spark_session()
-        lookups = [FeatureLookup(table_name=ref.split(":")[0], lookup_key=[entity_df.columns[0]]) for ref in feature_refs]
-        training_set = self.fe_client.create_training_set(df=spark.createDataFrame(entity_df), feature_lookups=lookups, label=None)
+        lookups = [
+            FeatureLookup(table_name=ref.split(":")[0], lookup_key=[entity_df.columns[0]]) for ref in feature_refs
+        ]
+        training_set = self.fe_client.create_training_set(
+            df=spark.createDataFrame(entity_df), feature_lookups=lookups, label=None
+        )
         return training_set.load_df().toPandas()
 
     def materialize(self, feature_views: List[str], start_date: str, end_date: str) -> None:
@@ -89,6 +134,9 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
         entity_df = pd.DataFrame(entity_keys)
         spark = get_spark_session()
         from databricks.feature_engineering import FeatureLookup
-        lookups = [FeatureLookup(table_name=ref.split(":")[0], lookup_key=list(entity_keys.keys())) for ref in feature_refs]
+
+        lookups = [
+            FeatureLookup(table_name=ref.split(":")[0], lookup_key=list(entity_keys.keys())) for ref in feature_refs
+        ]
         result = self.fe_client.score_batch(df=spark.createDataFrame(entity_df), feature_lookups=lookups)
         return result.toPandas().to_dict()
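
Taken together, the DatabricksFeatureStore hunks fall back to the legacy databricks.feature_store client when databricks.feature_engineering is absent, let create_table register an optional timeseries primary key, and reject the unsupported overwrite mode before any Spark work happens. A minimal usage sketch is below; it assumes a Databricks/Spark session, and the catalog, schema, table name, and sample data are invented for illustration (the import path is inferred from the file list above).

```python
import pandas as pd

from customer_retention.integrations.adapters.feature_store.databricks import DatabricksFeatureStore

store = DatabricksFeatureStore(catalog="main", schema="churn")  # hypothetical catalog/schema

# New in 0.76.0a1: timeseries_column is forwarded as timeseries_columns to create_table.
store.create_table(
    name="customer_events",
    schema={"customer_id": "string", "event_ts": "timestamp", "score": "float"},
    primary_keys=["customer_id"],
    timeseries_column="event_ts",
)

events = pd.DataFrame(
    {"customer_id": ["a1"], "event_ts": [pd.Timestamp("2024-01-01")], "score": [0.42]}
)
store.write_table("customer_events", events, mode="merge")        # the only accepted mode
# store.write_table("customer_events", events, mode="overwrite")  # now raises ValueError
```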
customer_retention/integrations/adapters/mlflow/base.py

@@ -30,3 +30,11 @@ class MLflowAdapter(ABC):
     @abstractmethod
     def transition_stage(self, model_name: str, version: str, stage: str) -> None:
         pass
+
+    @abstractmethod
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        pass
+
+    @abstractmethod
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        pass
customer_retention/integrations/adapters/mlflow/databricks.py

@@ -7,9 +7,12 @@ from .base import MLflowAdapter
 try:
     import mlflow
     from mlflow.tracking import MlflowClient
+
     MLFLOW_AVAILABLE = True
+    MLFLOW_MAJOR_VERSION = int(mlflow.__version__.split(".")[0])
 except ImportError:
     MLFLOW_AVAILABLE = False
+    MLFLOW_MAJOR_VERSION = 0
 
 
 class DatabricksMLflow(MLflowAdapter):
@@ -18,7 +21,8 @@ class DatabricksMLflow(MLflowAdapter):
             raise ImportError("PySpark required for DatabricksMLflow")
         if not MLFLOW_AVAILABLE:
             raise ImportError("mlflow package required")
-        mlflow.set_registry_uri(registry_uri)
+        if MLFLOW_MAJOR_VERSION < 3:
+            mlflow.set_registry_uri(registry_uri)
         self.registry_uri = registry_uri
         self._client = MlflowClient()
         self._run_id = None
@@ -44,7 +48,10 @@ class DatabricksMLflow(MLflowAdapter):
         mlflow.log_metrics(metrics)
 
     def log_model(self, model: Any, artifact_path: str, registered_name: Optional[str] = None) -> str:
-        info = mlflow.sklearn.log_model(model, artifact_path, registered_model_name=registered_name)
+        if MLFLOW_MAJOR_VERSION >= 3:
+            info = mlflow.sklearn.log_model(model, name=artifact_path, registered_model_name=registered_name)
+        else:
+            info = mlflow.sklearn.log_model(model, artifact_path, registered_model_name=registered_name)
         return info.model_uri
 
     def load_model(self, model_uri: str) -> Any:
@@ -52,3 +59,9 @@ class DatabricksMLflow(MLflowAdapter):
 
     def transition_stage(self, model_name: str, version: str, stage: str) -> None:
         self._client.set_model_version_tag(name=model_name, version=version, key="stage", value=stage)
+
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        self._client.set_registered_model_alias(name=model_name, alias=alias, version=version)
+
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        return self._client.get_model_version_by_alias(name=model_name, alias=alias)
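
Two themes run through the DatabricksMLflow changes: MLflow 3 compatibility (the registry URI is only set on older clients, and log_model passes the artifact path as name= on 3.x) and alias-based model promotion alongside the existing tag-based transition_stage. A rough sketch of the new surface follows; the model name, alias, version, and registry_uri value are invented for illustration, since the diff shows the constructor takes a registry_uri but not its default.

```python
from sklearn.linear_model import LogisticRegression

from customer_retention.integrations.adapters.mlflow.databricks import DatabricksMLflow

# Assumes a Databricks runtime with mlflow installed; "databricks-uc" is a guessed registry URI.
adapter = DatabricksMLflow(registry_uri="databricks-uc")

# log_model branches on MLFLOW_MAJOR_VERSION internally, so the same call works on MLflow 2.x and 3.x.
model_uri = adapter.log_model(LogisticRegression(), artifact_path="churn_model", registered_name="churn_model")

# New in 0.76.0a1: aliases as the promotion mechanism.
adapter.set_alias("churn_model", alias="champion", version="1")
champion = adapter.get_model_by_alias("churn_model", alias="champion")
print(model_uri, champion.version)
```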
customer_retention/integrations/adapters/mlflow/local.py

@@ -5,6 +5,7 @@ from .base import MLflowAdapter
 try:
     import mlflow
     from mlflow.tracking import MlflowClient
+
     MLFLOW_AVAILABLE = True
 except ImportError:
     MLFLOW_AVAILABLE = False
@@ -48,3 +49,9 @@ class LocalMLflow(MLflowAdapter):
 
     def transition_stage(self, model_name: str, version: str, stage: str) -> None:
         self._client.transition_model_version_stage(name=model_name, version=version, stage=stage)
+
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        self._client.set_registered_model_alias(name=model_name, alias=alias, version=version)
+
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        return self._client.get_model_version_by_alias(name=model_name, alias=alias)
customer_retention/integrations/databricks_init.py (new file)

@@ -0,0 +1,141 @@
+from __future__ import annotations
+
+import os
+import shutil
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+
+@dataclass
+class DatabricksInitResult:
+    catalog: str
+    schema: str
+    experiment_name: str
+    workspace_path: str | None
+    model_name: str
+    notebooks_copied: list[str] = field(default_factory=list)
+
+    @property
+    def environment_variables(self) -> dict[str, str]:
+        env_vars = {
+            "CR_CATALOG": self.catalog,
+            "CR_SCHEMA": self.schema,
+            "CR_EXPERIMENT_NAME": self.experiment_name,
+            "CR_EXPERIMENTS_DIR": f"/Workspace/{self.workspace_path}/experiments" if self.workspace_path else "",
+        }
+        if self.workspace_path:
+            env_vars["CR_WORKSPACE_PATH"] = self.workspace_path
+        return env_vars
+
+
+def databricks_init(
+    catalog: str = "main",
+    schema: str = "default",
+    experiment_name: str | None = None,
+    workspace_path: str | None = None,
+    copy_notebooks: bool = True,
+    model_name: str = "customer_retention",
+) -> DatabricksInitResult:
+    _validate_databricks_environment()
+    _set_environment_variables(catalog, schema, workspace_path)
+    resolved_experiment_name = experiment_name or _resolve_experiment_name_from_notebook_path()
+    _set_experiment_name_env_var(resolved_experiment_name)
+    _configure_mlflow_experiment(resolved_experiment_name)
+    notebooks_copied: list[str] = []
+    if copy_notebooks and workspace_path:
+        notebooks_copied = _copy_exploration_notebooks(workspace_path)
+    result = DatabricksInitResult(
+        catalog=catalog,
+        schema=schema,
+        experiment_name=resolved_experiment_name,
+        workspace_path=workspace_path,
+        model_name=model_name,
+        notebooks_copied=notebooks_copied,
+    )
+    _display_init_summary(result)
+    return result
+
+
+def _validate_databricks_environment() -> None:
+    if not os.environ.get("DATABRICKS_RUNTIME_VERSION"):
+        raise RuntimeError(
+            "databricks_init() must be called from a Databricks notebook. "
+            "DATABRICKS_RUNTIME_VERSION not found in environment."
+        )
+
+
+def _set_environment_variables(catalog: str, schema: str, workspace_path: str | None) -> None:
+    os.environ["CR_CATALOG"] = catalog
+    os.environ["CR_SCHEMA"] = schema
+    if workspace_path:
+        os.environ["CR_WORKSPACE_PATH"] = workspace_path
+        os.environ["CR_EXPERIMENTS_DIR"] = f"/Workspace/{workspace_path}/experiments"
+
+
+def _set_experiment_name_env_var(experiment_name: str) -> None:
+    os.environ["CR_EXPERIMENT_NAME"] = experiment_name
+
+
+def _resolve_experiment_name_from_notebook_path() -> str:
+    try:
+        dbutils = _get_dbutils()
+        if dbutils:
+            notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
+            return notebook_path.rsplit("/", 1)[-1]
+    except Exception:
+        pass
+    return "customer_retention"
+
+
+def _get_dbutils() -> Any | None:
+    try:
+        from customer_retention.core.compat.detection import get_dbutils
+
+        return get_dbutils()
+    except Exception:
+        return None
+
+
+def _configure_mlflow_experiment(experiment_name: str) -> None:
+    try:
+        import mlflow
+
+        mlflow.set_experiment(experiment_name)
+    except ImportError:
+        pass
+
+
+def _copy_exploration_notebooks(workspace_path: str) -> list[str]:
+    from customer_retention.generators.notebook_generator.project_init import ProjectInitializer
+
+    source_dir = ProjectInitializer(project_name="")._get_exploration_source_dir()
+    if not source_dir or not source_dir.exists():
+        return []
+
+    dest_dir = Path(f"/Workspace/{workspace_path}/exploration_notebooks")
+    dest_dir.mkdir(parents=True, exist_ok=True)
+
+    copied = []
+    for notebook in source_dir.glob("*.ipynb"):
+        dest_path = dest_dir / notebook.name
+        if not dest_path.exists():
+            shutil.copy2(notebook, dest_path)
+            copied.append(str(dest_path))
+
+    return copied
+
+
+def _display_init_summary(result: DatabricksInitResult) -> None:
+    print("ChurnKit Databricks Initialization Complete")
+    print("=" * 45)
+    print(f" Catalog: {result.catalog}")
+    print(f" Schema: {result.schema}")
+    print(f" Experiment: {result.experiment_name}")
+    print(f" Workspace Path: {result.workspace_path or '(not set)'}")
+    print(f" Model Name: {result.model_name}")
+    if result.notebooks_copied:
+        print(f" Notebooks Copied: {len(result.notebooks_copied)}")
+        for nb in result.notebooks_copied:
+            print(f" - {nb}")
+    print("=" * 45)
customer_retention/stages/profiling/temporal_feature_analyzer.py

@@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 from scipy import stats
 
-from customer_retention.core.compat import DataFrame, ensure_datetime_column, pd, qcut, to_pandas
+from customer_retention.core.compat import DataFrame, ensure_datetime_column, native_pd, pd, qcut, to_pandas
 from customer_retention.core.utils import compute_effect_size
 
 
@@ -642,7 +642,7 @@ class TemporalFeatureAnalyzer:
         )
 
     def _calculate_iv(self, feature: pd.Series, target: pd.Series, bins: int = 10) -> float:
-        df_iv = pd.DataFrame({"feature": feature, "target": target}).dropna()
+        df_iv = native_pd.DataFrame({"feature": feature, "target": target}).dropna()
         if len(df_iv) < bins * 2:
             return 0.0
         try:
@@ -666,7 +666,7 @@
         return float(grouped["iv"].sum())
 
     def _calculate_ks(self, feature: pd.Series, target: pd.Series) -> Tuple[float, float]:
-        df_ks = pd.DataFrame({"feature": feature, "target": target}).dropna()
+        df_ks = native_pd.DataFrame({"feature": feature, "target": target}).dropna()
         group0, group1 = df_ks[df_ks["target"] == 0]["feature"], df_ks[df_ks["target"] == 1]["feature"]
         if len(group0) == 0 or len(group1) == 0:
             return 0.0, 1.0
customer_retention/stages/profiling/temporal_feature_engineer.py

@@ -25,7 +25,7 @@ from typing import Any, Dict, List, Optional
 
 import numpy as np
 
-from customer_retention.core.compat import Timedelta, pd, to_datetime, to_pandas
+from customer_retention.core.compat import Timedelta, native_pd, pd, to_datetime, to_pandas
 
 
 class ReferenceMode(Enum):
@@ -307,7 +307,7 @@ class TemporalFeatureEngineer:
 
         if self.config.reference_mode == ReferenceMode.GLOBAL_DATE:
             ref_date = self.config.global_reference_date or datetime.now()
-            return pd.DataFrame({
+            return native_pd.DataFrame({
                 entity_col: entities,
                 "reference_date": ref_date,
             })
customer_retention/stages/profiling/temporal_pattern_analyzer.py

@@ -10,6 +10,7 @@ from customer_retention.core.compat import (
     Timestamp,
     cut,
     ensure_datetime_column,
+    native_pd,
     pd,
     safe_to_datetime,
     to_pandas,
@@ -316,7 +317,7 @@ def _diagnose_anomaly_pattern(
     entity_first = df.groupby(entity_column)[time_column].min()
     entity_last = df.groupby(entity_column)[time_column].max()
     tenure = (entity_last - entity_first).dt.days
-    tenure_by_target = pd.DataFrame({"target": entity_target, "tenure": tenure})
+    tenure_by_target = native_pd.DataFrame({"target": entity_target, "tenure": tenure})
     retained_tenure = tenure_by_target[tenure_by_target["target"] == 1]["tenure"]
     churned_tenure = tenure_by_target[tenure_by_target["target"] == 0]["tenure"]
     retained_median_tenure = float(retained_tenure.median()) if len(retained_tenure) > 0 else None
@@ -597,7 +598,7 @@ class TemporalPatternAnalyzer:
 
     def analyze_cohorts(self, df: DataFrame, entity_column: str, cohort_column: str, target_column: Optional[str] = None, period: str = "M") -> DataFrame:
         if len(df) == 0:
-            return pd.DataFrame()
+            return native_pd.DataFrame()
 
         df_copy = to_pandas(df).copy()
         ensure_datetime_column(df_copy, cohort_column)
@@ -638,7 +639,7 @@ class TemporalPatternAnalyzer:
         target_correlation = None
         if target_column and target_column in df.columns:
            entity_target = df.groupby(entity_column)[target_column].first()
-            combined = pd.DataFrame({"recency": recency_days, "target": entity_target}).dropna()
+            combined = native_pd.DataFrame({"recency": recency_days, "target": entity_target}).dropna()
 
             if len(combined) > 2:
                 corr, _ = stats.pearsonr(combined["recency"], combined["target"])
customer_retention/stages/profiling/time_series_profiler.py

@@ -7,6 +7,7 @@ from customer_retention.core.compat import (
     DataFrame,
     Timestamp,
     ensure_datetime_column,
+    native_pd,
     pd,
     to_pandas,
 )
@@ -95,7 +96,7 @@ def classify_lifecycle_quadrants(entity_lifecycles: DataFrame) -> LifecycleQuadr
         lifecycles=lc,
         tenure_threshold=tenure_threshold,
         intensity_threshold=intensity_threshold,
-        recommendations=pd.DataFrame(rows),
+        recommendations=native_pd.DataFrame(rows),
     )
 
 
@@ -160,7 +161,7 @@ def classify_activity_segments(entity_lifecycles: DataFrame) -> ActivitySegmentR
         lifecycles=lc,
         q25_threshold=q25,
         q75_threshold=q75,
-        recommendations=pd.DataFrame(rows),
+        recommendations=native_pd.DataFrame(rows),
     )
 
 
@@ -237,7 +238,7 @@ class TimeSeriesProfiler:
     def _compute_entity_lifecycles(self, df: DataFrame) -> DataFrame:
         grouped = df.groupby(self.entity_column)[self.time_column]
 
-        lifecycles = pd.DataFrame({
+        lifecycles = native_pd.DataFrame({
            "entity": grouped.first().index.tolist(),
            "first_event": grouped.min().values,
            "last_event": grouped.max().values,
@@ -302,7 +303,7 @@ class TimeSeriesProfiler:
             events_per_entity=DistributionStats(
                 min=0, max=0, mean=0, median=0, std=0, q25=0, q75=0
             ),
-            entity_lifecycles=pd.DataFrame(columns=[
+            entity_lifecycles=native_pd.DataFrame(columns=[
                "entity", "first_event", "last_event", "duration_days", "event_count"
            ]),
            avg_inter_event_days=None,
customer_retention/stages/profiling/time_window_aggregator.py

@@ -12,6 +12,7 @@ from customer_retention.core.compat import (
     Timestamp,
     ensure_datetime_column,
     is_numeric_dtype,
+    native_pd,
     pd,
     to_pandas,
 )
@@ -85,7 +86,7 @@ class TimeWindowAggregator:
     ) -> DataFrame:
         df = to_pandas(df)
         if len(df) == 0:
-            return pd.DataFrame()
+            return native_pd.DataFrame()
 
         df = df.copy()
         ensure_datetime_column(df, self.time_column)
@@ -113,7 +114,7 @@ class TimeWindowAggregator:
         if include_tenure:
             result_data["days_since_first_event"] = self._compute_tenure(df, entities, reference_date)
 
-        result = pd.DataFrame(result_data)
+        result = native_pd.DataFrame(result_data)
         result.attrs["aggregation_reference_date"] = (
             reference_date.isoformat() if hasattr(reference_date, "isoformat") else str(reference_date))
         result.attrs["aggregation_timestamp"] = Timestamp.now().isoformat()