churnkit 0.75.1a2__py3-none-any.whl → 0.76.0a1__py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (48)
  1. {churnkit-0.75.1a2.dist-info → churnkit-0.76.0a1.dist-info}/METADATA +5 -2
  2. {churnkit-0.75.1a2.dist-info → churnkit-0.76.0a1.dist-info}/RECORD +48 -47
  3. customer_retention/__init__.py +11 -1
  4. customer_retention/analysis/visualization/chart_builder.py +6 -7
  5. customer_retention/core/compat/__init__.py +53 -0
  6. customer_retention/core/config/__init__.py +43 -8
  7. customer_retention/core/config/experiments.py +20 -0
  8. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +2 -1
  9. customer_retention/generators/pipeline_generator/renderer.py +7 -5
  10. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +223 -149
  11. customer_retention/integrations/adapters/factory.py +8 -5
  12. customer_retention/integrations/adapters/feature_store/base.py +1 -0
  13. customer_retention/integrations/adapters/feature_store/databricks.py +58 -10
  14. customer_retention/integrations/adapters/mlflow/base.py +8 -0
  15. customer_retention/integrations/adapters/mlflow/databricks.py +15 -2
  16. customer_retention/integrations/adapters/mlflow/local.py +7 -0
  17. customer_retention/integrations/databricks_init.py +141 -0
  18. customer_retention/stages/features/temporal_features.py +12 -12
  19. customer_retention/stages/profiling/pattern_analysis_config.py +4 -3
  20. customer_retention/stages/profiling/temporal_feature_analyzer.py +5 -5
  21. customer_retention/stages/profiling/temporal_feature_engineer.py +2 -2
  22. customer_retention/stages/profiling/temporal_pattern_analyzer.py +22 -8
  23. customer_retention/stages/profiling/temporal_quality_checks.py +9 -5
  24. customer_retention/stages/profiling/time_series_profiler.py +9 -9
  25. customer_retention/stages/profiling/time_window_aggregator.py +7 -4
  26. customer_retention/stages/transformation/datetime_transformer.py +10 -2
  27. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +0 -0
  28. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +0 -0
  29. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +0 -0
  30. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +0 -0
  31. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +0 -0
  32. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +0 -0
  33. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +0 -0
  34. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +0 -0
  35. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +0 -0
  36. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +0 -0
  37. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +0 -0
  38. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +0 -0
  39. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +0 -0
  40. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +0 -0
  41. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +0 -0
  42. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +0 -0
  43. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +0 -0
  44. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +0 -0
  45. {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +0 -0
  46. {churnkit-0.75.1a2.dist-info → churnkit-0.76.0a1.dist-info}/WHEEL +0 -0
  47. {churnkit-0.75.1a2.dist-info → churnkit-0.76.0a1.dist-info}/entry_points.txt +0 -0
  48. {churnkit-0.75.1a2.dist-info → churnkit-0.76.0a1.dist-info}/licenses/LICENSE +0 -0
@@ -8,6 +8,24 @@ from ..base import AdapterResult
 from .base import FeatureStoreAdapter, FeatureViewConfig
 
 
+def _import_feature_engineering_client() -> Any:
+    try:
+        from databricks.feature_engineering import FeatureEngineeringClient
+
+        return FeatureEngineeringClient
+    except ImportError:
+        from databricks.feature_store import FeatureStoreClient
+
+        return FeatureStoreClient
+
+
+def _validate_write_mode(mode: str) -> None:
+    if mode == "overwrite":
+        raise ValueError(
+            "FeatureEngineeringClient.write_table only supports mode='merge'. Use mode='merge' instead of 'overwrite'."
+        )
+
+
 class DatabricksFeatureStore(FeatureStoreAdapter):
     def __init__(self, catalog: str = "main", schema: str = "default"):
         if not is_spark_available():
@@ -19,27 +37,46 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
     @property
     def fe_client(self) -> Any:
         if self._fe_client is None:
-            from databricks.feature_engineering import FeatureEngineeringClient
-            self._fe_client = FeatureEngineeringClient()
+            client_cls = _import_feature_engineering_client()
+            self._fe_client = client_cls()
         return self._fe_client
 
     def _full_name(self, name: str) -> str:
         return f"{self.catalog}.{self.schema}.{name}"
 
-    def create_table(self, name: str, schema: Dict[str, str], primary_keys: List[str]) -> AdapterResult:
+    def create_table(
+        self, name: str, schema: Dict[str, str], primary_keys: List[str], timeseries_column: Optional[str] = None
+    ) -> AdapterResult:
         full_name = self._full_name(name)
         spark = get_spark_session()
         df = spark.createDataFrame([], self._schema_to_spark(schema))
-        self.fe_client.create_table(name=full_name, primary_keys=primary_keys, df=df)
+        kwargs: Dict[str, Any] = {"name": full_name, "primary_keys": primary_keys, "df": df}
+        if timeseries_column:
+            kwargs["timeseries_columns"] = [timeseries_column]
+        self.fe_client.create_table(**kwargs)
         return AdapterResult(success=True, metadata={"name": full_name})
 
     def _schema_to_spark(self, schema: Dict[str, str]) -> Any:
-        from pyspark.sql.types import FloatType, IntegerType, StringType, StructField, StructType
-        type_map = {"int": IntegerType(), "float": FloatType(), "string": StringType()}
+        from pyspark.sql.types import (
+            FloatType,
+            IntegerType,
+            StringType,
+            StructField,
+            StructType,
+            TimestampType,
+        )
+
+        type_map = {
+            "int": IntegerType(),
+            "float": FloatType(),
+            "string": StringType(),
+            "timestamp": TimestampType(),
+        }
         fields = [StructField(name, type_map.get(dtype, StringType()), True) for name, dtype in schema.items()]
         return StructType(fields)
 
     def write_table(self, name: str, df: pd.DataFrame, mode: str = "merge") -> AdapterResult:
+        _validate_write_mode(mode)
         full_name = self._full_name(name)
         spark = get_spark_session()
         spark_df = spark.createDataFrame(df)
@@ -72,14 +109,22 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
         table_name = self._full_name(config.name)
         spark = get_spark_session()
         spark_df = spark.createDataFrame(df)
-        self.fe_client.create_table(name=table_name, primary_keys=[config.entity_key], df=spark_df)
+        kwargs: Dict[str, Any] = {"name": table_name, "primary_keys": [config.entity_key], "df": spark_df}
+        if hasattr(config, "timeseries_column") and config.timeseries_column:
+            kwargs["timeseries_columns"] = [config.timeseries_column]
+        self.fe_client.create_table(**kwargs)
         return table_name
 
     def get_historical_features(self, entity_df: pd.DataFrame, feature_refs: List[str]) -> pd.DataFrame:
         from databricks.feature_engineering import FeatureLookup
+
         spark = get_spark_session()
-        lookups = [FeatureLookup(table_name=ref.split(":")[0], lookup_key=[entity_df.columns[0]]) for ref in feature_refs]
-        training_set = self.fe_client.create_training_set(df=spark.createDataFrame(entity_df), feature_lookups=lookups, label=None)
+        lookups = [
+            FeatureLookup(table_name=ref.split(":")[0], lookup_key=[entity_df.columns[0]]) for ref in feature_refs
+        ]
+        training_set = self.fe_client.create_training_set(
+            df=spark.createDataFrame(entity_df), feature_lookups=lookups, label=None
+        )
         return training_set.load_df().toPandas()
 
     def materialize(self, feature_views: List[str], start_date: str, end_date: str) -> None:
@@ -89,6 +134,9 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
         entity_df = pd.DataFrame(entity_keys)
         spark = get_spark_session()
         from databricks.feature_engineering import FeatureLookup
-        lookups = [FeatureLookup(table_name=ref.split(":")[0], lookup_key=list(entity_keys.keys())) for ref in feature_refs]
+
+        lookups = [
+            FeatureLookup(table_name=ref.split(":")[0], lookup_key=list(entity_keys.keys())) for ref in feature_refs
+        ]
         result = self.fe_client.score_batch(df=spark.createDataFrame(entity_df), feature_lookups=lookups)
         return result.toPandas().to_dict()
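Taken together, the feature-store changes above fall back to the legacy FeatureStoreClient when databricks.feature_engineering is unavailable, let callers declare a time-series column when creating tables, and reject unsupported write modes up front. A minimal usage sketch of the updated surface, assuming a Databricks runtime with Spark; catalog, schema, table, and column names here are illustrative, not taken from the package:

# Hypothetical call pattern against the 0.76.0a1 DatabricksFeatureStore; names are placeholders.
import pandas as pd

from customer_retention.integrations.adapters.feature_store.databricks import DatabricksFeatureStore

store = DatabricksFeatureStore(catalog="main", schema="churn")

# New optional argument: declare a time-series column so the table is created
# with timeseries_columns for point-in-time lookups.
store.create_table(
    name="customer_features",
    schema={"customer_id": "string", "tenure_days": "int", "feature_timestamp": "timestamp"},
    primary_keys=["customer_id"],
    timeseries_column="feature_timestamp",
)

features = pd.DataFrame(
    {"customer_id": ["c1"], "tenure_days": [120], "feature_timestamp": [pd.Timestamp("2024-01-01")]}
)
store.write_table("customer_features", features, mode="merge")
# store.write_table("customer_features", features, mode="overwrite")  # now raises ValueError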
@@ -30,3 +30,11 @@ class MLflowAdapter(ABC):
     @abstractmethod
     def transition_stage(self, model_name: str, version: str, stage: str) -> None:
         pass
+
+    @abstractmethod
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        pass
+
+    @abstractmethod
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        pass
@@ -7,9 +7,12 @@ from .base import MLflowAdapter
 try:
     import mlflow
     from mlflow.tracking import MlflowClient
+
     MLFLOW_AVAILABLE = True
+    MLFLOW_MAJOR_VERSION = int(mlflow.__version__.split(".")[0])
 except ImportError:
     MLFLOW_AVAILABLE = False
+    MLFLOW_MAJOR_VERSION = 0
 
 
 class DatabricksMLflow(MLflowAdapter):
@@ -18,7 +21,8 @@ class DatabricksMLflow(MLflowAdapter):
             raise ImportError("PySpark required for DatabricksMLflow")
         if not MLFLOW_AVAILABLE:
             raise ImportError("mlflow package required")
-        mlflow.set_registry_uri(registry_uri)
+        if MLFLOW_MAJOR_VERSION < 3:
+            mlflow.set_registry_uri(registry_uri)
         self.registry_uri = registry_uri
         self._client = MlflowClient()
         self._run_id = None
@@ -44,7 +48,10 @@ class DatabricksMLflow(MLflowAdapter):
         mlflow.log_metrics(metrics)
 
     def log_model(self, model: Any, artifact_path: str, registered_name: Optional[str] = None) -> str:
-        info = mlflow.sklearn.log_model(model, artifact_path, registered_model_name=registered_name)
+        if MLFLOW_MAJOR_VERSION >= 3:
+            info = mlflow.sklearn.log_model(model, name=artifact_path, registered_model_name=registered_name)
+        else:
+            info = mlflow.sklearn.log_model(model, artifact_path, registered_model_name=registered_name)
         return info.model_uri
 
     def load_model(self, model_uri: str) -> Any:
@@ -52,3 +59,9 @@ class DatabricksMLflow(MLflowAdapter):
 
     def transition_stage(self, model_name: str, version: str, stage: str) -> None:
         self._client.set_model_version_tag(name=model_name, version=version, key="stage", value=stage)
+
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        self._client.set_registered_model_alias(name=model_name, alias=alias, version=version)
+
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        return self._client.get_model_version_by_alias(name=model_name, alias=alias)
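The adapter now reads the installed MLflow major version at import time: on MLflow 3+ it skips set_registry_uri() and passes the artifact path as the name= keyword that mlflow.sklearn.log_model expects there, and the new set_alias/get_model_by_alias methods wrap MlflowClient.set_registered_model_alias and get_model_version_by_alias. A hedged sketch of how the alias workflow might be exercised; the model name, version, and fitted estimator are illustrative, and the constructor's registry_uri default is an assumption:

# Illustrative only; assumes a configured MLflow environment and scikit-learn installed.
from sklearn.linear_model import LogisticRegression

from customer_retention.integrations.adapters.mlflow.databricks import DatabricksMLflow

adapter = DatabricksMLflow()  # registry_uri default assumed; pass one explicitly if required
model = LogisticRegression().fit([[0.0], [1.0]], [0, 1])
model_uri = adapter.log_model(model, artifact_path="model", registered_name="churn_classifier")

adapter.set_alias("churn_classifier", alias="champion", version="1")
champion = adapter.get_model_by_alias("churn_classifier", alias="champion")
print(model_uri, champion.version)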
@@ -5,6 +5,7 @@ from .base import MLflowAdapter
 try:
     import mlflow
     from mlflow.tracking import MlflowClient
+
     MLFLOW_AVAILABLE = True
 except ImportError:
     MLFLOW_AVAILABLE = False
@@ -48,3 +49,9 @@ class LocalMLflow(MLflowAdapter):
 
     def transition_stage(self, model_name: str, version: str, stage: str) -> None:
         self._client.transition_model_version_stage(name=model_name, version=version, stage=stage)
+
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        self._client.set_registered_model_alias(name=model_name, alias=alias, version=version)
+
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        return self._client.get_model_version_by_alias(name=model_name, alias=alias)
@@ -0,0 +1,141 @@
+from __future__ import annotations
+
+import os
+import shutil
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+
+@dataclass
+class DatabricksInitResult:
+    catalog: str
+    schema: str
+    experiment_name: str
+    workspace_path: str | None
+    model_name: str
+    notebooks_copied: list[str] = field(default_factory=list)
+
+    @property
+    def environment_variables(self) -> dict[str, str]:
+        env_vars = {
+            "CR_CATALOG": self.catalog,
+            "CR_SCHEMA": self.schema,
+            "CR_EXPERIMENT_NAME": self.experiment_name,
+            "CR_EXPERIMENTS_DIR": f"/Workspace/{self.workspace_path}/experiments" if self.workspace_path else "",
+        }
+        if self.workspace_path:
+            env_vars["CR_WORKSPACE_PATH"] = self.workspace_path
+        return env_vars
+
+
+def databricks_init(
+    catalog: str = "main",
+    schema: str = "default",
+    experiment_name: str | None = None,
+    workspace_path: str | None = None,
+    copy_notebooks: bool = True,
+    model_name: str = "customer_retention",
+) -> DatabricksInitResult:
+    _validate_databricks_environment()
+    _set_environment_variables(catalog, schema, workspace_path)
+    resolved_experiment_name = experiment_name or _resolve_experiment_name_from_notebook_path()
+    _set_experiment_name_env_var(resolved_experiment_name)
+    _configure_mlflow_experiment(resolved_experiment_name)
+    notebooks_copied: list[str] = []
+    if copy_notebooks and workspace_path:
+        notebooks_copied = _copy_exploration_notebooks(workspace_path)
+    result = DatabricksInitResult(
+        catalog=catalog,
+        schema=schema,
+        experiment_name=resolved_experiment_name,
+        workspace_path=workspace_path,
+        model_name=model_name,
+        notebooks_copied=notebooks_copied,
+    )
+    _display_init_summary(result)
+    return result
+
+
+def _validate_databricks_environment() -> None:
+    if not os.environ.get("DATABRICKS_RUNTIME_VERSION"):
+        raise RuntimeError(
+            "databricks_init() must be called from a Databricks notebook. "
+            "DATABRICKS_RUNTIME_VERSION not found in environment."
+        )
+
+
+def _set_environment_variables(catalog: str, schema: str, workspace_path: str | None) -> None:
+    os.environ["CR_CATALOG"] = catalog
+    os.environ["CR_SCHEMA"] = schema
+    if workspace_path:
+        os.environ["CR_WORKSPACE_PATH"] = workspace_path
+        os.environ["CR_EXPERIMENTS_DIR"] = f"/Workspace/{workspace_path}/experiments"
+
+
+def _set_experiment_name_env_var(experiment_name: str) -> None:
+    os.environ["CR_EXPERIMENT_NAME"] = experiment_name
+
+
+def _resolve_experiment_name_from_notebook_path() -> str:
+    try:
+        dbutils = _get_dbutils()
+        if dbutils:
+            notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
+            return notebook_path.rsplit("/", 1)[-1]
+    except Exception:
+        pass
+    return "customer_retention"
+
+
+def _get_dbutils() -> Any | None:
+    try:
+        from customer_retention.core.compat.detection import get_dbutils
+
+        return get_dbutils()
+    except Exception:
+        return None
+
+
+def _configure_mlflow_experiment(experiment_name: str) -> None:
+    try:
+        import mlflow
+
+        mlflow.set_experiment(experiment_name)
+    except ImportError:
+        pass
+
+
+def _copy_exploration_notebooks(workspace_path: str) -> list[str]:
+    from customer_retention.generators.notebook_generator.project_init import ProjectInitializer
+
+    source_dir = ProjectInitializer(project_name="")._get_exploration_source_dir()
+    if not source_dir or not source_dir.exists():
+        return []
+
+    dest_dir = Path(f"/Workspace/{workspace_path}/exploration_notebooks")
+    dest_dir.mkdir(parents=True, exist_ok=True)
+
+    copied = []
+    for notebook in source_dir.glob("*.ipynb"):
+        dest_path = dest_dir / notebook.name
+        if not dest_path.exists():
+            shutil.copy2(notebook, dest_path)
+            copied.append(str(dest_path))
+
+    return copied
+
+
+def _display_init_summary(result: DatabricksInitResult) -> None:
+    print("ChurnKit Databricks Initialization Complete")
+    print("=" * 45)
+    print(f" Catalog: {result.catalog}")
+    print(f" Schema: {result.schema}")
+    print(f" Experiment: {result.experiment_name}")
+    print(f" Workspace Path: {result.workspace_path or '(not set)'}")
+    print(f" Model Name: {result.model_name}")
+    if result.notebooks_copied:
+        print(f" Notebooks Copied: {len(result.notebooks_copied)}")
+        for nb in result.notebooks_copied:
+            print(f" - {nb}")
+    print("=" * 45)
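The new databricks_init module gives notebooks a single entry point that validates the runtime, exports the CR_* environment variables, points MLflow at an experiment, and optionally copies the bundled exploration notebooks into the workspace. A rough sketch of what a notebook cell might look like; the catalog, schema, and workspace path are placeholders, and outside Databricks the call raises RuntimeError because DATABRICKS_RUNTIME_VERSION is not set:

# Illustrative notebook cell; values are placeholders.
from customer_retention.integrations.databricks_init import databricks_init

result = databricks_init(
    catalog="main",
    schema="churn",
    workspace_path="Users/someone@example.com/churnkit",
    copy_notebooks=True,
)
print(result.environment_variables["CR_EXPERIMENTS_DIR"])
# -> /Workspace/Users/someone@example.com/churnkit/experiments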
@@ -10,7 +10,7 @@ from dataclasses import dataclass, field
 from enum import Enum
 from typing import List, Optional, Union
 
-from customer_retention.core.compat import DataFrame, Series, Timedelta, Timestamp, pd
+from customer_retention.core.compat import DataFrame, Series, Timedelta, Timestamp, pd, safe_to_datetime, to_pandas
 
 
 class ReferenceDateSource(Enum):
@@ -122,19 +122,19 @@ class TemporalFeatureGenerator:
         if not self._is_fitted:
             raise ValueError("Generator not fitted. Call fit() first.")
 
-        result = df.copy()
+        result = to_pandas(df).copy()
         self.generated_features = []
         warnings_list = []
 
         # Get reference date(s) for this transform
         if self.reference_date_source in [ReferenceDateSource.COLUMN, ReferenceDateSource.FEATURE_TIMESTAMP]:
-            ref_dates = pd.to_datetime(df[self.reference_date_column], format='mixed')
+            ref_dates = safe_to_datetime(df[self.reference_date_column])
         else:
            ref_dates = self.reference_date
 
        # Tenure features
        if self.created_column and self.created_column in df.columns:
-            created = pd.to_datetime(df[self.created_column], format='mixed')
+            created = safe_to_datetime(df[self.created_column])
            tenure_days = self._compute_days_diff(ref_dates, created)
            result["tenure_days"] = tenure_days
            self.generated_features.append("tenure_days")
@@ -154,7 +154,7 @@ class TemporalFeatureGenerator:
 
         # Recency features
         if self.last_order_column and self.last_order_column in df.columns:
-            last_order = pd.to_datetime(df[self.last_order_column], format='mixed')
+            last_order = safe_to_datetime(df[self.last_order_column])
             days_since_last = self._compute_days_diff(ref_dates, last_order)
             result["days_since_last_order"] = days_since_last
             self.generated_features.append("days_since_last_order")
@@ -162,8 +162,8 @@ class TemporalFeatureGenerator:
         # Activation features
         if (self.first_order_column and self.first_order_column in df.columns and
                 self.created_column and self.created_column in df.columns):
-            created = pd.to_datetime(df[self.created_column], format='mixed')
-            first_order = pd.to_datetime(df[self.first_order_column], format='mixed')
+            created = safe_to_datetime(df[self.created_column])
+            first_order = safe_to_datetime(df[self.first_order_column])
             days_to_first = self._compute_days_diff(first_order, created)
             result["days_to_first_order"] = days_to_first
             self.generated_features.append("days_to_first_order")
@@ -171,8 +171,8 @@ class TemporalFeatureGenerator:
         # Active period
         if (self.first_order_column and self.first_order_column in df.columns and
                 self.last_order_column and self.last_order_column in df.columns):
-            first_order = pd.to_datetime(df[self.first_order_column], format='mixed')
-            last_order = pd.to_datetime(df[self.last_order_column], format='mixed')
+            first_order = safe_to_datetime(df[self.first_order_column])
+            last_order = safe_to_datetime(df[self.last_order_column])
             active_period = self._compute_days_diff(last_order, first_order)
             result["active_period_days"] = active_period
             self.generated_features.append("active_period_days")
@@ -210,21 +210,21 @@ class TemporalFeatureGenerator:
                 raise ValueError(
                     "date_column must be provided when source is MAX_DATE"
                 )
-            self.reference_date = pd.to_datetime(df[self.date_column], format='mixed').max()
+            self.reference_date = safe_to_datetime(df[self.date_column]).max()
 
         elif self.reference_date_source == ReferenceDateSource.COLUMN:
             if self.reference_date_column is None:
                 raise ValueError(
                     "reference_date_column must be provided when source is COLUMN"
                 )
-            self.reference_date = pd.to_datetime(df[self.reference_date_column], format='mixed')
+            self.reference_date = safe_to_datetime(df[self.reference_date_column])
 
         elif self.reference_date_source == ReferenceDateSource.FEATURE_TIMESTAMP:
             if "feature_timestamp" not in df.columns:
                 raise ValueError(
                     "feature_timestamp column required when source is FEATURE_TIMESTAMP"
                 )
-            self.reference_date = pd.to_datetime(df["feature_timestamp"], format='mixed')
+            self.reference_date = safe_to_datetime(df["feature_timestamp"])
             self.reference_date_column = "feature_timestamp"
 
     def _compute_days_diff(
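Every pd.to_datetime(..., format='mixed') call in this module is replaced by the compat helper safe_to_datetime, and transform() now routes its input through to_pandas before copying. The helper's implementation is not part of this diff; the sketch below is only an assumed shape for such a wrapper, since format='mixed' exists only on pandas 2.0 and newer:

# Assumed shape only; the packaged safe_to_datetime lives in customer_retention.core.compat
# and is not shown in this diff.
import pandas as pd

def safe_to_datetime_sketch(values):
    try:
        # pandas >= 2.0: parse heterogeneous date strings element by element.
        return pd.to_datetime(values, format="mixed")
    except (TypeError, ValueError):
        # Older pandas (or an unparseable mix): fall back to inference,
        # coercing failures to NaT instead of raising.
        return pd.to_datetime(values, errors="coerce")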
@@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 import pandas as pd
 
-from customer_retention.core.compat import DataFrame
+from customer_retention.core.compat import DataFrame, ensure_datetime_column, to_pandas
 
 
 @dataclass
@@ -216,12 +216,13 @@ class SparklineDataBuilder:
         self.freq = freq
 
     def build(self, df: DataFrame, columns: List[str]) -> Tuple[List[SparklineData], bool]:
-        import pandas as pd
+        df = to_pandas(df)
         has_target = self.target_column is not None and self.target_column in df.columns
         if has_target:
             validate_not_event_level(df, self.entity_column, self.target_column)
         df_work = self._prepare_working_df(df, has_target)
-        df_work['_period'] = pd.to_datetime(df_work[self.time_column]).dt.to_period(self.freq).dt.start_time
+        ensure_datetime_column(df_work, self.time_column)
+        df_work['_period'] = df_work[self.time_column].dt.to_period(self.freq).dt.start_time
         results = [self._build_sparkline_for_column(df_work, col, has_target)
                    for col in columns if col in df_work.columns]
         return results, has_target
@@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 from scipy import stats
 
-from customer_retention.core.compat import DataFrame, pd, qcut, to_datetime
+from customer_retention.core.compat import DataFrame, ensure_datetime_column, native_pd, pd, qcut, to_pandas
 from customer_retention.core.utils import compute_effect_size
 
 
@@ -626,8 +626,8 @@ class TemporalFeatureAnalyzer:
         return next_priority
 
     def _prepare_dataframe(self, df: DataFrame) -> DataFrame:
-        df = df.copy()
-        df[self.time_column] = to_datetime(df[self.time_column])
+        df = to_pandas(df).copy()
+        ensure_datetime_column(df, self.time_column)
        return df
 
     def _validate_event_level_target_usage(self, df: DataFrame, target_column: Optional[str]) -> None:
@@ -642,7 +642,7 @@ class TemporalFeatureAnalyzer:
            )
 
     def _calculate_iv(self, feature: pd.Series, target: pd.Series, bins: int = 10) -> float:
-        df_iv = pd.DataFrame({"feature": feature, "target": target}).dropna()
+        df_iv = native_pd.DataFrame({"feature": feature, "target": target}).dropna()
         if len(df_iv) < bins * 2:
             return 0.0
         try:
@@ -666,7 +666,7 @@ class TemporalFeatureAnalyzer:
         return float(grouped["iv"].sum())
 
     def _calculate_ks(self, feature: pd.Series, target: pd.Series) -> Tuple[float, float]:
-        df_ks = pd.DataFrame({"feature": feature, "target": target}).dropna()
+        df_ks = native_pd.DataFrame({"feature": feature, "target": target}).dropna()
         group0, group1 = df_ks[df_ks["target"] == 0]["feature"], df_ks[df_ks["target"] == 1]["feature"]
         if len(group0) == 0 or len(group1) == 0:
             return 0.0, 1.0
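The analyzer now normalizes inputs with to_pandas plus ensure_datetime_column and builds its scratch frames with native_pd.DataFrame, keeping the compat pd shim out of purely internal computations. As with safe_to_datetime above, ensure_datetime_column is internal to customer_retention.core.compat and not shown here; the sketch below is an assumed equivalent, used only to illustrate the in-place coercion pattern the new code relies on:

# Assumed equivalent of the in-place coercion pattern; not the packaged implementation.
import pandas as pd

def ensure_datetime_column_sketch(df: pd.DataFrame, column: str) -> None:
    if not pd.api.types.is_datetime64_any_dtype(df[column]):
        df[column] = pd.to_datetime(df[column], errors="coerce")

events = pd.DataFrame({"event_ts": ["2024-01-01", "2024-02-15"], "value": [1.0, 2.0]})
ensure_datetime_column_sketch(events, "event_ts")
print(events.dtypes["event_ts"])  # datetime64[ns]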
@@ -25,7 +25,7 @@ from typing import Any, Dict, List, Optional
 
 import numpy as np
 
-from customer_retention.core.compat import Timedelta, pd, to_datetime, to_pandas
+from customer_retention.core.compat import Timedelta, native_pd, pd, to_datetime, to_pandas
 
 
 class ReferenceMode(Enum):
@@ -307,7 +307,7 @@ class TemporalFeatureEngineer:
 
         if self.config.reference_mode == ReferenceMode.GLOBAL_DATE:
             ref_date = self.config.global_reference_date or datetime.now()
-            return pd.DataFrame({
+            return native_pd.DataFrame({
                 entity_col: entities,
                 "reference_date": ref_date,
             })
@@ -5,7 +5,16 @@ from typing import Dict, List, Optional, Tuple
 import numpy as np
 from scipy import stats
 
-from customer_retention.core.compat import DataFrame, Timestamp, cut, pd, to_datetime, to_pandas
+from customer_retention.core.compat import (
+    DataFrame,
+    Timestamp,
+    cut,
+    ensure_datetime_column,
+    native_pd,
+    pd,
+    safe_to_datetime,
+    to_pandas,
+)
 from customer_retention.core.utils import compute_effect_size
 
 
@@ -177,6 +186,8 @@ def generate_trend_recommendations(trend: TrendResult, mean_value: float = 1.0)
 
 
 def analyze_cohort_distribution(first_events: DataFrame, time_column: str) -> CohortDistribution:
+    first_events = to_pandas(first_events)
+    ensure_datetime_column(first_events, time_column)
     years = first_events[time_column].dt.year
     year_counts = years.value_counts().sort_index().to_dict()
     total = len(first_events)
@@ -232,6 +243,7 @@ def compute_recency_buckets(
     reference_date: Timestamp, bucket_edges: Optional[List[float]] = None
 ) -> List[RecencyBucketStats]:
     df = to_pandas(df)
+    ensure_datetime_column(df, time_column)
     edges = bucket_edges or DEFAULT_BUCKET_EDGES
     labels = _generate_bucket_labels(edges)
     entity_last = df.groupby(entity_column)[time_column].max().reset_index()
@@ -298,13 +310,14 @@ def _diagnose_anomaly_pattern(
     df: DataFrame, entity_column: str, time_column: str, target_column: str
 ) -> AnomalyDiagnostics:
     df = to_pandas(df)
+    ensure_datetime_column(df, time_column)
     entity_target = df.groupby(entity_column)[target_column].first()
     target_1_pct = float(entity_target.mean() * 100)
     target_1_is_minority = target_1_pct < 50
     entity_first = df.groupby(entity_column)[time_column].min()
     entity_last = df.groupby(entity_column)[time_column].max()
     tenure = (entity_last - entity_first).dt.days
-    tenure_by_target = pd.DataFrame({"target": entity_target, "tenure": tenure})
+    tenure_by_target = native_pd.DataFrame({"target": entity_target, "tenure": tenure})
     retained_tenure = tenure_by_target[tenure_by_target["target"] == 1]["tenure"]
     churned_tenure = tenure_by_target[tenure_by_target["target"] == 0]["tenure"]
     retained_median_tenure = float(retained_tenure.median()) if len(retained_tenure) > 0 else None
@@ -436,6 +449,7 @@ def compare_recency_by_target(
     df = to_pandas(df)
     if target_column not in df.columns:
         return None
+    ensure_datetime_column(df, time_column)
     ref_date = reference_date or df[time_column].max()
     entity_last = df.groupby(entity_column)[time_column].max().reset_index()
     entity_last["recency_days"] = (ref_date - entity_last[time_column]).dt.days
@@ -502,7 +516,7 @@ class TemporalPatternAnalyzer:
         if len(df_clean) < 3:
             return self._unknown_trend()
 
-        time_col = to_datetime(df_clean[self.time_column])
+        time_col = safe_to_datetime(df_clean[self.time_column])
         x = (time_col - time_col.min()).dt.total_seconds() / 86400
         y = df_clean[value_column].values
 
@@ -584,12 +598,13 @@ class TemporalPatternAnalyzer:
 
     def analyze_cohorts(self, df: DataFrame, entity_column: str, cohort_column: str, target_column: Optional[str] = None, period: str = "M") -> DataFrame:
         if len(df) == 0:
-            return pd.DataFrame()
+            return native_pd.DataFrame()
 
         df_copy = to_pandas(df).copy()
+        ensure_datetime_column(df_copy, cohort_column)
         entity_first_event = df_copy.groupby(entity_column)[cohort_column].min()
         df_copy["_cohort"] = df_copy[entity_column].map(entity_first_event)
-        df_copy["_cohort"] = to_datetime(df_copy["_cohort"]).dt.to_period(period)
+        df_copy["_cohort"] = df_copy["_cohort"].dt.to_period(period)
 
         entity_cohorts = df_copy.groupby(entity_column)["_cohort"].first().reset_index()
         entity_cohorts.columns = [entity_column, "_cohort"]
@@ -615,17 +630,16 @@ class TemporalPatternAnalyzer:
             return RecencyResult(avg_recency_days=0, median_recency_days=0, min_recency_days=0, max_recency_days=0)
 
         df = to_pandas(df)
+        ensure_datetime_column(df, self.time_column)
         ref_date = reference_date or Timestamp.now()
-        to_datetime(df[self.time_column])
 
         entity_last = df.groupby(entity_column)[self.time_column].max()
-        entity_last = to_datetime(entity_last)
         recency_days = (ref_date - entity_last).dt.days
 
         target_correlation = None
         if target_column and target_column in df.columns:
             entity_target = df.groupby(entity_column)[target_column].first()
-            combined = pd.DataFrame({"recency": recency_days, "target": entity_target}).dropna()
+            combined = native_pd.DataFrame({"recency": recency_days, "target": entity_target}).dropna()
 
             if len(combined) > 2:
                 corr, _ = stats.pearsonr(combined["recency"], combined["target"])
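For reference, the recency computation that these hunks clean up (now relying on ensure_datetime_column instead of two ad-hoc to_datetime calls) boils down to a plain-pandas pattern: take the latest event per entity and count days back from a reference date. A self-contained illustration with made-up data:

# Plain pandas, illustrative data only.
import pandas as pd

events = pd.DataFrame({
    "customer_id": ["a", "a", "b"],
    "event_ts": pd.to_datetime(["2024-01-01", "2024-03-01", "2024-02-10"]),
})
ref_date = pd.Timestamp("2024-04-01")
entity_last = events.groupby("customer_id")["event_ts"].max()
recency_days = (ref_date - entity_last).dt.days
print(recency_days.to_dict())  # {'a': 31, 'b': 51}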
@@ -1,7 +1,7 @@
1
1
  from dataclasses import dataclass, field
2
2
  from typing import Optional
3
3
 
4
- from customer_retention.core.compat import DataFrame, Timestamp, to_datetime, to_pandas
4
+ from customer_retention.core.compat import DataFrame, Timestamp, ensure_datetime_column, safe_to_datetime, to_pandas
5
5
  from customer_retention.core.components.enums import Severity
6
6
 
7
7
 
@@ -38,6 +38,7 @@ class DuplicateEventCheck(TemporalQualityCheck):
38
38
  self.time_column = time_column
39
39
 
40
40
  def run(self, df: DataFrame) -> TemporalQualityResult:
41
+ df = to_pandas(df)
41
42
  if len(df) == 0:
42
43
  return self._pass_result("No data to check")
43
44
 
@@ -70,11 +71,12 @@ class TemporalGapCheck(TemporalQualityCheck):
70
71
  self.max_gap_multiple = max_gap_multiple
71
72
 
72
73
  def run(self, df: DataFrame) -> TemporalQualityResult:
74
+ df = to_pandas(df)
73
75
  if len(df) < 2:
74
76
  return self._pass_result("Insufficient data to check gaps")
75
77
 
76
- df = to_pandas(df)
77
- time_col = to_datetime(df.sort_values(self.time_column)[self.time_column])
78
+ ensure_datetime_column(df, self.time_column)
79
+ time_col = df.sort_values(self.time_column)[self.time_column]
78
80
  diffs_days = time_col.diff().dropna().dt.total_seconds() / 86400
79
81
  expected_days = self.FREQ_TO_DAYS.get(self.expected_frequency, 1)
80
82
  threshold_days = expected_days * self.max_gap_multiple
@@ -108,10 +110,11 @@ class FutureDateCheck(TemporalQualityCheck):
108
110
  self.reference_date = reference_date or Timestamp.now()
109
111
 
110
112
  def run(self, df: DataFrame) -> TemporalQualityResult:
113
+ df = to_pandas(df)
111
114
  if len(df) == 0:
112
115
  return self._pass_result("No data to check")
113
116
 
114
- time_col = to_datetime(df[self.time_column])
117
+ time_col = safe_to_datetime(df[self.time_column])
115
118
  future_mask = time_col > self.reference_date
116
119
  future_count = future_mask.sum()
117
120
 
@@ -138,10 +141,11 @@ class EventOrderCheck(TemporalQualityCheck):
138
141
  self.time_column = time_column
139
142
 
140
143
  def run(self, df: DataFrame) -> TemporalQualityResult:
144
+ df = to_pandas(df)
141
145
  if len(df) < 2:
142
146
  return self._pass_result("Insufficient data to check ordering")
143
147
 
144
- df_check = df.assign(_parsed_time=to_datetime(df[self.time_column]))
148
+ df_check = df.assign(_parsed_time=safe_to_datetime(df[self.time_column]))
145
149
  collision_counts = df_check.groupby([self.entity_column, "_parsed_time"]).size()
146
150
  ambiguous = collision_counts[collision_counts > 1]
147
151
  ambiguous_count = ambiguous.sum() - len(ambiguous)