churnkit 0.75.1a2__py3-none-any.whl → 0.76.0a1__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- {churnkit-0.75.1a2.dist-info → churnkit-0.76.0a1.dist-info}/METADATA +5 -2
- {churnkit-0.75.1a2.dist-info → churnkit-0.76.0a1.dist-info}/RECORD +48 -47
- customer_retention/__init__.py +11 -1
- customer_retention/analysis/visualization/chart_builder.py +6 -7
- customer_retention/core/compat/__init__.py +53 -0
- customer_retention/core/config/__init__.py +43 -8
- customer_retention/core/config/experiments.py +20 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +2 -1
- customer_retention/generators/pipeline_generator/renderer.py +7 -5
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +223 -149
- customer_retention/integrations/adapters/factory.py +8 -5
- customer_retention/integrations/adapters/feature_store/base.py +1 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +58 -10
- customer_retention/integrations/adapters/mlflow/base.py +8 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +15 -2
- customer_retention/integrations/adapters/mlflow/local.py +7 -0
- customer_retention/integrations/databricks_init.py +141 -0
- customer_retention/stages/features/temporal_features.py +12 -12
- customer_retention/stages/profiling/pattern_analysis_config.py +4 -3
- customer_retention/stages/profiling/temporal_feature_analyzer.py +5 -5
- customer_retention/stages/profiling/temporal_feature_engineer.py +2 -2
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +22 -8
- customer_retention/stages/profiling/temporal_quality_checks.py +9 -5
- customer_retention/stages/profiling/time_series_profiler.py +9 -9
- customer_retention/stages/profiling/time_window_aggregator.py +7 -4
- customer_retention/stages/transformation/datetime_transformer.py +10 -2
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +0 -0
- {churnkit-0.75.1a2.dist-info → churnkit-0.76.0a1.dist-info}/WHEEL +0 -0
- {churnkit-0.75.1a2.dist-info → churnkit-0.76.0a1.dist-info}/entry_points.txt +0 -0
- {churnkit-0.75.1a2.dist-info → churnkit-0.76.0a1.dist-info}/licenses/LICENSE +0 -0

customer_retention/integrations/adapters/feature_store/databricks.py

@@ -8,6 +8,24 @@ from ..base import AdapterResult
 from .base import FeatureStoreAdapter, FeatureViewConfig
 
 
+def _import_feature_engineering_client() -> Any:
+    try:
+        from databricks.feature_engineering import FeatureEngineeringClient
+
+        return FeatureEngineeringClient
+    except ImportError:
+        from databricks.feature_store import FeatureStoreClient
+
+        return FeatureStoreClient
+
+
+def _validate_write_mode(mode: str) -> None:
+    if mode == "overwrite":
+        raise ValueError(
+            "FeatureEngineeringClient.write_table only supports mode='merge'. Use mode='merge' instead of 'overwrite'."
+        )
+
+
 class DatabricksFeatureStore(FeatureStoreAdapter):
     def __init__(self, catalog: str = "main", schema: str = "default"):
         if not is_spark_available():
@@ -19,27 +37,46 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
     @property
     def fe_client(self) -> Any:
         if self._fe_client is None:
-
-            self._fe_client =
+            client_cls = _import_feature_engineering_client()
+            self._fe_client = client_cls()
         return self._fe_client
 
     def _full_name(self, name: str) -> str:
         return f"{self.catalog}.{self.schema}.{name}"
 
-    def create_table(
+    def create_table(
+        self, name: str, schema: Dict[str, str], primary_keys: List[str], timeseries_column: Optional[str] = None
+    ) -> AdapterResult:
         full_name = self._full_name(name)
         spark = get_spark_session()
         df = spark.createDataFrame([], self._schema_to_spark(schema))
-
+        kwargs: Dict[str, Any] = {"name": full_name, "primary_keys": primary_keys, "df": df}
+        if timeseries_column:
+            kwargs["timeseries_columns"] = [timeseries_column]
+        self.fe_client.create_table(**kwargs)
         return AdapterResult(success=True, metadata={"name": full_name})
 
     def _schema_to_spark(self, schema: Dict[str, str]) -> Any:
-        from pyspark.sql.types import
-
+        from pyspark.sql.types import (
+            FloatType,
+            IntegerType,
+            StringType,
+            StructField,
+            StructType,
+            TimestampType,
+        )
+
+        type_map = {
+            "int": IntegerType(),
+            "float": FloatType(),
+            "string": StringType(),
+            "timestamp": TimestampType(),
+        }
         fields = [StructField(name, type_map.get(dtype, StringType()), True) for name, dtype in schema.items()]
         return StructType(fields)
 
     def write_table(self, name: str, df: pd.DataFrame, mode: str = "merge") -> AdapterResult:
+        _validate_write_mode(mode)
         full_name = self._full_name(name)
         spark = get_spark_session()
         spark_df = spark.createDataFrame(df)
@@ -72,14 +109,22 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
         table_name = self._full_name(config.name)
         spark = get_spark_session()
         spark_df = spark.createDataFrame(df)
-
+        kwargs: Dict[str, Any] = {"name": table_name, "primary_keys": [config.entity_key], "df": spark_df}
+        if hasattr(config, "timeseries_column") and config.timeseries_column:
+            kwargs["timeseries_columns"] = [config.timeseries_column]
+        self.fe_client.create_table(**kwargs)
         return table_name
 
     def get_historical_features(self, entity_df: pd.DataFrame, feature_refs: List[str]) -> pd.DataFrame:
         from databricks.feature_engineering import FeatureLookup
+
         spark = get_spark_session()
-        lookups = [
-
+        lookups = [
+            FeatureLookup(table_name=ref.split(":")[0], lookup_key=[entity_df.columns[0]]) for ref in feature_refs
+        ]
+        training_set = self.fe_client.create_training_set(
+            df=spark.createDataFrame(entity_df), feature_lookups=lookups, label=None
+        )
         return training_set.load_df().toPandas()
 
     def materialize(self, feature_views: List[str], start_date: str, end_date: str) -> None:
@@ -89,6 +134,9 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
         entity_df = pd.DataFrame(entity_keys)
         spark = get_spark_session()
         from databricks.feature_engineering import FeatureLookup
-
+
+        lookups = [
+            FeatureLookup(table_name=ref.split(":")[0], lookup_key=list(entity_keys.keys())) for ref in feature_refs
+        ]
         result = self.fe_client.score_batch(df=spark.createDataFrame(entity_df), feature_lookups=lookups)
         return result.toPandas().to_dict()
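
Note: Taken together, these hunks route create_table through a single kwargs dict, attach the optional timeseries column only when one is configured, and hard-fail overwrite writes, which the new error message says the current client no longer supports. A rough usage sketch of the adapter surface as it stands after this diff; the table name, schema dict, and data are invented for illustration, not sample code shipped with the package:

    # Illustrative only: names and data are invented; signatures follow the diff above.
    import pandas as pd

    from customer_retention.integrations.adapters.feature_store.databricks import DatabricksFeatureStore

    store = DatabricksFeatureStore(catalog="main", schema="churn")

    # timeseries_column is forwarded as timeseries_columns=["feature_timestamp"]
    # only because it is non-None.
    store.create_table(
        name="customer_features",
        schema={"customer_id": "string", "tenure_days": "int", "feature_timestamp": "timestamp"},
        primary_keys=["customer_id"],
        timeseries_column="feature_timestamp",
    )

    df = pd.DataFrame({
        "customer_id": ["c1"],
        "tenure_days": [42],
        "feature_timestamp": [pd.Timestamp("2024-01-01")],
    })
    store.write_table("customer_features", df, mode="merge")  # ok
    # store.write_table("customer_features", df, mode="overwrite")  # ValueError from _validate_write_mode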

customer_retention/integrations/adapters/mlflow/base.py

@@ -30,3 +30,11 @@ class MLflowAdapter(ABC):
     @abstractmethod
     def transition_stage(self, model_name: str, version: str, stage: str) -> None:
         pass
+
+    @abstractmethod
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        pass
+
+    @abstractmethod
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        pass

customer_retention/integrations/adapters/mlflow/databricks.py

@@ -7,9 +7,12 @@ from .base import MLflowAdapter
 try:
     import mlflow
     from mlflow.tracking import MlflowClient
+
     MLFLOW_AVAILABLE = True
+    MLFLOW_MAJOR_VERSION = int(mlflow.__version__.split(".")[0])
 except ImportError:
     MLFLOW_AVAILABLE = False
+    MLFLOW_MAJOR_VERSION = 0
 
 
 class DatabricksMLflow(MLflowAdapter):
@@ -18,7 +21,8 @@ class DatabricksMLflow(MLflowAdapter):
             raise ImportError("PySpark required for DatabricksMLflow")
         if not MLFLOW_AVAILABLE:
             raise ImportError("mlflow package required")
-
+        if MLFLOW_MAJOR_VERSION < 3:
+            mlflow.set_registry_uri(registry_uri)
         self.registry_uri = registry_uri
         self._client = MlflowClient()
         self._run_id = None
@@ -44,7 +48,10 @@ class DatabricksMLflow(MLflowAdapter):
         mlflow.log_metrics(metrics)
 
     def log_model(self, model: Any, artifact_path: str, registered_name: Optional[str] = None) -> str:
-
+        if MLFLOW_MAJOR_VERSION >= 3:
+            info = mlflow.sklearn.log_model(model, name=artifact_path, registered_model_name=registered_name)
+        else:
+            info = mlflow.sklearn.log_model(model, artifact_path, registered_model_name=registered_name)
         return info.model_uri
 
     def load_model(self, model_uri: str) -> Any:
@@ -52,3 +59,9 @@ class DatabricksMLflow(MLflowAdapter):
 
     def transition_stage(self, model_name: str, version: str, stage: str) -> None:
         self._client.set_model_version_tag(name=model_name, version=version, key="stage", value=stage)
+
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        self._client.set_registered_model_alias(name=model_name, alias=alias, version=version)
+
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        return self._client.get_model_version_by_alias(name=model_name, alias=alias)
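
Note: The MLFLOW_MAJOR_VERSION gate tracks the MLflow 3 signature change in which log_model's positional artifact_path argument was replaced by the name keyword. A minimal, self-contained sketch of the same pattern, assuming mlflow and scikit-learn are installed; the run and model are placeholders:

    import mlflow
    import mlflow.sklearn
    from sklearn.linear_model import LogisticRegression

    MLFLOW_MAJOR_VERSION = int(mlflow.__version__.split(".")[0])

    with mlflow.start_run():
        model = LogisticRegression()
        if MLFLOW_MAJOR_VERSION >= 3:
            # MLflow 3.x: the logging target is the `name` keyword
            info = mlflow.sklearn.log_model(model, name="churn_model")
        else:
            # MLflow 1.x/2.x: positional artifact_path
            info = mlflow.sklearn.log_model(model, "churn_model")
        print(info.model_uri)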

customer_retention/integrations/adapters/mlflow/local.py

@@ -5,6 +5,7 @@ from .base import MLflowAdapter
 try:
     import mlflow
     from mlflow.tracking import MlflowClient
+
     MLFLOW_AVAILABLE = True
 except ImportError:
     MLFLOW_AVAILABLE = False
@@ -48,3 +49,9 @@ class LocalMLflow(MLflowAdapter):
 
     def transition_stage(self, model_name: str, version: str, stage: str) -> None:
         self._client.transition_model_version_stage(name=model_name, version=version, stage=stage)
+
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        self._client.set_registered_model_alias(name=model_name, alias=alias, version=version)
+
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        return self._client.get_model_version_by_alias(name=model_name, alias=alias)
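
Note: Both adapters now expose MLflow registered-model aliases, the registry's replacement for the deprecated stage transitions; set_registered_model_alias and get_model_version_by_alias are real MlflowClient methods. A hedged sketch of the promotion flow through the adapter; the model name, alias, and version are illustrative, and LocalMLflow constructor arguments are omitted since they are not shown in this diff:

    from customer_retention.integrations.adapters.mlflow.local import LocalMLflow

    adapter = LocalMLflow()  # constructor args, if any, omitted here

    # Point the "champion" alias at version 3 of the registered model.
    adapter.set_alias("churn_model", alias="champion", version="3")

    mv = adapter.get_model_by_alias("churn_model", alias="champion")
    print(mv.version)  # whichever version currently holds the alias

    # Aliases are mutable pointers: calling set_alias again with version="4"
    # promotes the new version without touching the old one.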

customer_retention/integrations/databricks_init.py (new file)

@@ -0,0 +1,141 @@
+from __future__ import annotations
+
+import os
+import shutil
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+
+@dataclass
+class DatabricksInitResult:
+    catalog: str
+    schema: str
+    experiment_name: str
+    workspace_path: str | None
+    model_name: str
+    notebooks_copied: list[str] = field(default_factory=list)
+
+    @property
+    def environment_variables(self) -> dict[str, str]:
+        env_vars = {
+            "CR_CATALOG": self.catalog,
+            "CR_SCHEMA": self.schema,
+            "CR_EXPERIMENT_NAME": self.experiment_name,
+            "CR_EXPERIMENTS_DIR": f"/Workspace/{self.workspace_path}/experiments" if self.workspace_path else "",
+        }
+        if self.workspace_path:
+            env_vars["CR_WORKSPACE_PATH"] = self.workspace_path
+        return env_vars
+
+
+def databricks_init(
+    catalog: str = "main",
+    schema: str = "default",
+    experiment_name: str | None = None,
+    workspace_path: str | None = None,
+    copy_notebooks: bool = True,
+    model_name: str = "customer_retention",
+) -> DatabricksInitResult:
+    _validate_databricks_environment()
+    _set_environment_variables(catalog, schema, workspace_path)
+    resolved_experiment_name = experiment_name or _resolve_experiment_name_from_notebook_path()
+    _set_experiment_name_env_var(resolved_experiment_name)
+    _configure_mlflow_experiment(resolved_experiment_name)
+    notebooks_copied: list[str] = []
+    if copy_notebooks and workspace_path:
+        notebooks_copied = _copy_exploration_notebooks(workspace_path)
+    result = DatabricksInitResult(
+        catalog=catalog,
+        schema=schema,
+        experiment_name=resolved_experiment_name,
+        workspace_path=workspace_path,
+        model_name=model_name,
+        notebooks_copied=notebooks_copied,
+    )
+    _display_init_summary(result)
+    return result
+
+
+def _validate_databricks_environment() -> None:
+    if not os.environ.get("DATABRICKS_RUNTIME_VERSION"):
+        raise RuntimeError(
+            "databricks_init() must be called from a Databricks notebook. "
+            "DATABRICKS_RUNTIME_VERSION not found in environment."
+        )
+
+
+def _set_environment_variables(catalog: str, schema: str, workspace_path: str | None) -> None:
+    os.environ["CR_CATALOG"] = catalog
+    os.environ["CR_SCHEMA"] = schema
+    if workspace_path:
+        os.environ["CR_WORKSPACE_PATH"] = workspace_path
+        os.environ["CR_EXPERIMENTS_DIR"] = f"/Workspace/{workspace_path}/experiments"
+
+
+def _set_experiment_name_env_var(experiment_name: str) -> None:
+    os.environ["CR_EXPERIMENT_NAME"] = experiment_name
+
+
+def _resolve_experiment_name_from_notebook_path() -> str:
+    try:
+        dbutils = _get_dbutils()
+        if dbutils:
+            notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
+            return notebook_path.rsplit("/", 1)[-1]
+    except Exception:
+        pass
+    return "customer_retention"
+
+
+def _get_dbutils() -> Any | None:
+    try:
+        from customer_retention.core.compat.detection import get_dbutils
+
+        return get_dbutils()
+    except Exception:
+        return None
+
+
+def _configure_mlflow_experiment(experiment_name: str) -> None:
+    try:
+        import mlflow
+
+        mlflow.set_experiment(experiment_name)
+    except ImportError:
+        pass
+
+
+def _copy_exploration_notebooks(workspace_path: str) -> list[str]:
+    from customer_retention.generators.notebook_generator.project_init import ProjectInitializer
+
+    source_dir = ProjectInitializer(project_name="")._get_exploration_source_dir()
+    if not source_dir or not source_dir.exists():
+        return []
+
+    dest_dir = Path(f"/Workspace/{workspace_path}/exploration_notebooks")
+    dest_dir.mkdir(parents=True, exist_ok=True)
+
+    copied = []
+    for notebook in source_dir.glob("*.ipynb"):
+        dest_path = dest_dir / notebook.name
+        if not dest_path.exists():
+            shutil.copy2(notebook, dest_path)
+            copied.append(str(dest_path))
+
+    return copied
+
+
+def _display_init_summary(result: DatabricksInitResult) -> None:
+    print("ChurnKit Databricks Initialization Complete")
+    print("=" * 45)
+    print(f"  Catalog: {result.catalog}")
+    print(f"  Schema: {result.schema}")
+    print(f"  Experiment: {result.experiment_name}")
+    print(f"  Workspace Path: {result.workspace_path or '(not set)'}")
+    print(f"  Model Name: {result.model_name}")
+    if result.notebooks_copied:
+        print(f"  Notebooks Copied: {len(result.notebooks_copied)}")
+        for nb in result.notebooks_copied:
+            print(f"    - {nb}")
+    print("=" * 45)
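
Note: The new module is a one-call bootstrap for Databricks notebooks. A usage sketch; the catalog, schema, and workspace path are illustrative, and the import path assumes the module is imported directly rather than through a package-level re-export:

    # Must run inside a Databricks notebook: the function raises RuntimeError
    # when DATABRICKS_RUNTIME_VERSION is absent from the environment.
    from customer_retention.integrations.databricks_init import databricks_init

    result = databricks_init(
        catalog="main",
        schema="churn_demo",
        workspace_path="Users/me@example.com/churnkit",
    )

    # Downstream code reads the CR_* variables this call exported:
    print(result.environment_variables)
    # {'CR_CATALOG': 'main', 'CR_SCHEMA': 'churn_demo', 'CR_EXPERIMENT_NAME': '...',
    #  'CR_EXPERIMENTS_DIR': '/Workspace/Users/me@example.com/churnkit/experiments',
    #  'CR_WORKSPACE_PATH': 'Users/me@example.com/churnkit'}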

customer_retention/stages/features/temporal_features.py

@@ -10,7 +10,7 @@ from dataclasses import dataclass, field
 from enum import Enum
 from typing import List, Optional, Union
 
-from customer_retention.core.compat import DataFrame, Series, Timedelta, Timestamp, pd
+from customer_retention.core.compat import DataFrame, Series, Timedelta, Timestamp, pd, safe_to_datetime, to_pandas
 
 
 class ReferenceDateSource(Enum):
@@ -122,19 +122,19 @@ class TemporalFeatureGenerator:
         if not self._is_fitted:
             raise ValueError("Generator not fitted. Call fit() first.")
 
-        result = df.copy()
+        result = to_pandas(df).copy()
         self.generated_features = []
         warnings_list = []
 
         # Get reference date(s) for this transform
         if self.reference_date_source in [ReferenceDateSource.COLUMN, ReferenceDateSource.FEATURE_TIMESTAMP]:
-            ref_dates =
+            ref_dates = safe_to_datetime(df[self.reference_date_column])
         else:
             ref_dates = self.reference_date
 
         # Tenure features
         if self.created_column and self.created_column in df.columns:
-            created =
+            created = safe_to_datetime(df[self.created_column])
             tenure_days = self._compute_days_diff(ref_dates, created)
             result["tenure_days"] = tenure_days
             self.generated_features.append("tenure_days")
@@ -154,7 +154,7 @@ class TemporalFeatureGenerator:
 
         # Recency features
         if self.last_order_column and self.last_order_column in df.columns:
-            last_order =
+            last_order = safe_to_datetime(df[self.last_order_column])
             days_since_last = self._compute_days_diff(ref_dates, last_order)
             result["days_since_last_order"] = days_since_last
             self.generated_features.append("days_since_last_order")
@@ -162,8 +162,8 @@ class TemporalFeatureGenerator:
         # Activation features
         if (self.first_order_column and self.first_order_column in df.columns and
             self.created_column and self.created_column in df.columns):
-            created =
-            first_order =
+            created = safe_to_datetime(df[self.created_column])
+            first_order = safe_to_datetime(df[self.first_order_column])
             days_to_first = self._compute_days_diff(first_order, created)
             result["days_to_first_order"] = days_to_first
             self.generated_features.append("days_to_first_order")
@@ -171,8 +171,8 @@ class TemporalFeatureGenerator:
         # Active period
         if (self.first_order_column and self.first_order_column in df.columns and
             self.last_order_column and self.last_order_column in df.columns):
-            first_order =
-            last_order =
+            first_order = safe_to_datetime(df[self.first_order_column])
+            last_order = safe_to_datetime(df[self.last_order_column])
             active_period = self._compute_days_diff(last_order, first_order)
             result["active_period_days"] = active_period
             self.generated_features.append("active_period_days")
@@ -210,21 +210,21 @@ class TemporalFeatureGenerator:
                 raise ValueError(
                     "date_column must be provided when source is MAX_DATE"
                 )
-            self.reference_date =
+            self.reference_date = safe_to_datetime(df[self.date_column]).max()
 
         elif self.reference_date_source == ReferenceDateSource.COLUMN:
             if self.reference_date_column is None:
                 raise ValueError(
                     "reference_date_column must be provided when source is COLUMN"
                 )
-            self.reference_date =
+            self.reference_date = safe_to_datetime(df[self.reference_date_column])
 
         elif self.reference_date_source == ReferenceDateSource.FEATURE_TIMESTAMP:
             if "feature_timestamp" not in df.columns:
                 raise ValueError(
                     "feature_timestamp column required when source is FEATURE_TIMESTAMP"
                 )
-            self.reference_date =
+            self.reference_date = safe_to_datetime(df["feature_timestamp"])
             self.reference_date_column = "feature_timestamp"
 
     def _compute_days_diff(
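
Note: Every rewritten assignment above now goes through compat helpers instead of calling pandas directly. Their definitions live in customer_retention/core/compat, which this diff does not show; the sketch below is a hypothetical minimal reading of their apparent contract, not the actual source:

    # HYPOTHETICAL shims: the real implementations are in core/compat and may differ.
    import pandas as native_pd  # plain pandas, as distinct from a pandas-on-Spark alias


    def to_pandas(df):
        # Normalize Spark or pandas-on-Spark input to a native pandas DataFrame.
        if hasattr(df, "toPandas"):   # pyspark.sql.DataFrame
            return df.toPandas()
        if hasattr(df, "to_pandas"):  # pyspark.pandas.DataFrame
            return df.to_pandas()
        return df


    def safe_to_datetime(series):
        # Parse to datetime without raising on unparseable values.
        return native_pd.to_datetime(series, errors="coerce")


    def ensure_datetime_column(df, column):
        # Coerce one column to datetime dtype in place, if it is not already.
        if not native_pd.api.types.is_datetime64_any_dtype(df[column]):
            df[column] = safe_to_datetime(df[column])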

customer_retention/analysis/visualization/chart_builder.py

@@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 import pandas as pd
 
-from customer_retention.core.compat import DataFrame
+from customer_retention.core.compat import DataFrame, ensure_datetime_column, to_pandas
 
 
 @dataclass
@@ -216,12 +216,13 @@ class SparklineDataBuilder:
         self.freq = freq
 
     def build(self, df: DataFrame, columns: List[str]) -> Tuple[List[SparklineData], bool]:
-
+        df = to_pandas(df)
         has_target = self.target_column is not None and self.target_column in df.columns
         if has_target:
             validate_not_event_level(df, self.entity_column, self.target_column)
         df_work = self._prepare_working_df(df, has_target)
-        df_work
+        ensure_datetime_column(df_work, self.time_column)
+        df_work['_period'] = df_work[self.time_column].dt.to_period(self.freq).dt.start_time
         results = [self._build_sparkline_for_column(df_work, col, has_target)
                    for col in columns if col in df_work.columns]
         return results, has_target
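
Note: The new _period column snaps every timestamp to the start of its bucket, which is what the sparkline aggregation groups on. The same expression in isolation, with illustrative dates:

    import pandas as pd

    ts = pd.Series(pd.to_datetime(["2024-01-03", "2024-01-20", "2024-02-05"]))
    # to_period("M") buckets by month; start_time snaps each bucket to its first instant.
    print(ts.dt.to_period("M").dt.start_time.tolist())
    # [Timestamp('2024-01-01'), Timestamp('2024-01-01'), Timestamp('2024-02-01')]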

customer_retention/stages/profiling/temporal_feature_analyzer.py

@@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 from scipy import stats
 
-from customer_retention.core.compat import DataFrame, pd, qcut,
+from customer_retention.core.compat import DataFrame, ensure_datetime_column, native_pd, pd, qcut, to_pandas
 from customer_retention.core.utils import compute_effect_size
 
 
@@ -626,8 +626,8 @@ class TemporalFeatureAnalyzer:
         return next_priority
 
     def _prepare_dataframe(self, df: DataFrame) -> DataFrame:
-        df = df.copy()
-        df
+        df = to_pandas(df).copy()
+        ensure_datetime_column(df, self.time_column)
         return df
 
     def _validate_event_level_target_usage(self, df: DataFrame, target_column: Optional[str]) -> None:
@@ -642,7 +642,7 @@ class TemporalFeatureAnalyzer:
         )
 
     def _calculate_iv(self, feature: pd.Series, target: pd.Series, bins: int = 10) -> float:
-        df_iv =
+        df_iv = native_pd.DataFrame({"feature": feature, "target": target}).dropna()
         if len(df_iv) < bins * 2:
             return 0.0
         try:
@@ -666,7 +666,7 @@ class TemporalFeatureAnalyzer:
         return float(grouped["iv"].sum())
 
     def _calculate_ks(self, feature: pd.Series, target: pd.Series) -> Tuple[float, float]:
-        df_ks =
+        df_ks = native_pd.DataFrame({"feature": feature, "target": target}).dropna()
         group0, group1 = df_ks[df_ks["target"] == 0]["feature"], df_ks[df_ks["target"] == 1]["feature"]
         if len(group0) == 0 or len(group1) == 0:
             return 0.0, 1.0

customer_retention/stages/profiling/temporal_feature_engineer.py

@@ -25,7 +25,7 @@ from typing import Any, Dict, List, Optional
 
 import numpy as np
 
-from customer_retention.core.compat import Timedelta, pd, to_datetime, to_pandas
+from customer_retention.core.compat import Timedelta, native_pd, pd, to_datetime, to_pandas
 
 
 class ReferenceMode(Enum):
@@ -307,7 +307,7 @@ class TemporalFeatureEngineer:
 
         if self.config.reference_mode == ReferenceMode.GLOBAL_DATE:
             ref_date = self.config.global_reference_date or datetime.now()
-            return
+            return native_pd.DataFrame({
                 entity_col: entities,
                 "reference_date": ref_date,
             })
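
Note: The GLOBAL_DATE branch leans on pandas broadcasting a scalar across the entity column, producing one reference row per entity. Self-contained, with illustrative column names:

    from datetime import datetime

    import pandas as pd

    entities = ["c1", "c2", "c3"]
    ref_date = datetime(2024, 6, 1)

    # The scalar broadcasts against the list: one row per entity, same date.
    print(pd.DataFrame({"customer_id": entities, "reference_date": ref_date}))
    #   customer_id reference_date
    # 0          c1     2024-06-01
    # 1          c2     2024-06-01
    # 2          c3     2024-06-01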

customer_retention/stages/profiling/temporal_pattern_analyzer.py

@@ -5,7 +5,16 @@ from typing import Dict, List, Optional, Tuple
 import numpy as np
 from scipy import stats
 
-from customer_retention.core.compat import
+from customer_retention.core.compat import (
+    DataFrame,
+    Timestamp,
+    cut,
+    ensure_datetime_column,
+    native_pd,
+    pd,
+    safe_to_datetime,
+    to_pandas,
+)
 from customer_retention.core.utils import compute_effect_size
 
 
@@ -177,6 +186,8 @@ def generate_trend_recommendations(trend: TrendResult, mean_value: float = 1.0)
 
 
 def analyze_cohort_distribution(first_events: DataFrame, time_column: str) -> CohortDistribution:
+    first_events = to_pandas(first_events)
+    ensure_datetime_column(first_events, time_column)
     years = first_events[time_column].dt.year
     year_counts = years.value_counts().sort_index().to_dict()
     total = len(first_events)
@@ -232,6 +243,7 @@ def compute_recency_buckets(
     reference_date: Timestamp, bucket_edges: Optional[List[float]] = None
 ) -> List[RecencyBucketStats]:
     df = to_pandas(df)
+    ensure_datetime_column(df, time_column)
    edges = bucket_edges or DEFAULT_BUCKET_EDGES
     labels = _generate_bucket_labels(edges)
     entity_last = df.groupby(entity_column)[time_column].max().reset_index()
@@ -298,13 +310,14 @@ def _diagnose_anomaly_pattern(
     df: DataFrame, entity_column: str, time_column: str, target_column: str
 ) -> AnomalyDiagnostics:
     df = to_pandas(df)
+    ensure_datetime_column(df, time_column)
     entity_target = df.groupby(entity_column)[target_column].first()
     target_1_pct = float(entity_target.mean() * 100)
     target_1_is_minority = target_1_pct < 50
     entity_first = df.groupby(entity_column)[time_column].min()
     entity_last = df.groupby(entity_column)[time_column].max()
     tenure = (entity_last - entity_first).dt.days
-    tenure_by_target =
+    tenure_by_target = native_pd.DataFrame({"target": entity_target, "tenure": tenure})
     retained_tenure = tenure_by_target[tenure_by_target["target"] == 1]["tenure"]
     churned_tenure = tenure_by_target[tenure_by_target["target"] == 0]["tenure"]
     retained_median_tenure = float(retained_tenure.median()) if len(retained_tenure) > 0 else None
@@ -436,6 +449,7 @@ def compare_recency_by_target(
     df = to_pandas(df)
     if target_column not in df.columns:
         return None
+    ensure_datetime_column(df, time_column)
     ref_date = reference_date or df[time_column].max()
     entity_last = df.groupby(entity_column)[time_column].max().reset_index()
     entity_last["recency_days"] = (ref_date - entity_last[time_column]).dt.days
@@ -502,7 +516,7 @@ class TemporalPatternAnalyzer:
         if len(df_clean) < 3:
             return self._unknown_trend()
 
-        time_col =
+        time_col = safe_to_datetime(df_clean[self.time_column])
         x = (time_col - time_col.min()).dt.total_seconds() / 86400
         y = df_clean[value_column].values
 
@@ -584,12 +598,13 @@ class TemporalPatternAnalyzer:
 
     def analyze_cohorts(self, df: DataFrame, entity_column: str, cohort_column: str, target_column: Optional[str] = None, period: str = "M") -> DataFrame:
         if len(df) == 0:
-            return
+            return native_pd.DataFrame()
 
         df_copy = to_pandas(df).copy()
+        ensure_datetime_column(df_copy, cohort_column)
         entity_first_event = df_copy.groupby(entity_column)[cohort_column].min()
         df_copy["_cohort"] = df_copy[entity_column].map(entity_first_event)
-        df_copy["_cohort"] =
+        df_copy["_cohort"] = df_copy["_cohort"].dt.to_period(period)
 
         entity_cohorts = df_copy.groupby(entity_column)["_cohort"].first().reset_index()
         entity_cohorts.columns = [entity_column, "_cohort"]
@@ -615,17 +630,16 @@ class TemporalPatternAnalyzer:
             return RecencyResult(avg_recency_days=0, median_recency_days=0, min_recency_days=0, max_recency_days=0)
 
         df = to_pandas(df)
+        ensure_datetime_column(df, self.time_column)
         ref_date = reference_date or Timestamp.now()
-        to_datetime(df[self.time_column])
 
         entity_last = df.groupby(entity_column)[self.time_column].max()
-        entity_last = to_datetime(entity_last)
         recency_days = (ref_date - entity_last).dt.days
 
         target_correlation = None
         if target_column and target_column in df.columns:
             entity_target = df.groupby(entity_column)[target_column].first()
-            combined =
+            combined = native_pd.DataFrame({"recency": recency_days, "target": entity_target}).dropna()
 
             if len(combined) > 2:
                 corr, _ = stats.pearsonr(combined["recency"], combined["target"])

customer_retention/stages/profiling/temporal_quality_checks.py

@@ -1,7 +1,7 @@
 from dataclasses import dataclass, field
 from typing import Optional
 
-from customer_retention.core.compat import DataFrame, Timestamp,
+from customer_retention.core.compat import DataFrame, Timestamp, ensure_datetime_column, safe_to_datetime, to_pandas
 from customer_retention.core.components.enums import Severity
 
 
@@ -38,6 +38,7 @@ class DuplicateEventCheck(TemporalQualityCheck):
         self.time_column = time_column
 
     def run(self, df: DataFrame) -> TemporalQualityResult:
+        df = to_pandas(df)
         if len(df) == 0:
             return self._pass_result("No data to check")
 
@@ -70,11 +71,12 @@ class TemporalGapCheck(TemporalQualityCheck):
         self.max_gap_multiple = max_gap_multiple
 
     def run(self, df: DataFrame) -> TemporalQualityResult:
+        df = to_pandas(df)
         if len(df) < 2:
             return self._pass_result("Insufficient data to check gaps")
 
-        df
-        time_col =
+        ensure_datetime_column(df, self.time_column)
+        time_col = df.sort_values(self.time_column)[self.time_column]
         diffs_days = time_col.diff().dropna().dt.total_seconds() / 86400
         expected_days = self.FREQ_TO_DAYS.get(self.expected_frequency, 1)
         threshold_days = expected_days * self.max_gap_multiple
@@ -108,10 +110,11 @@ class FutureDateCheck(TemporalQualityCheck):
         self.reference_date = reference_date or Timestamp.now()
 
     def run(self, df: DataFrame) -> TemporalQualityResult:
+        df = to_pandas(df)
         if len(df) == 0:
             return self._pass_result("No data to check")
 
-        time_col =
+        time_col = safe_to_datetime(df[self.time_column])
         future_mask = time_col > self.reference_date
         future_count = future_mask.sum()
 
@@ -138,10 +141,11 @@ class EventOrderCheck(TemporalQualityCheck):
         self.time_column = time_column
 
     def run(self, df: DataFrame) -> TemporalQualityResult:
+        df = to_pandas(df)
         if len(df) < 2:
             return self._pass_result("Insufficient data to check ordering")
 
-        df_check = df.assign(_parsed_time=
+        df_check = df.assign(_parsed_time=safe_to_datetime(df[self.time_column]))
         collision_counts = df_check.groupby([self.entity_column, "_parsed_time"]).size()
         ambiguous = collision_counts[collision_counts > 1]
         ambiguous_count = ambiguous.sum() - len(ambiguous)