churnkit 0.75.1a3__py3-none-any.whl → 0.76.0a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a2.dist-info}/METADATA +5 -2
- {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a2.dist-info}/RECORD +41 -40
- customer_retention/__init__.py +11 -1
- customer_retention/core/compat/__init__.py +3 -0
- customer_retention/core/config/__init__.py +43 -8
- customer_retention/core/config/experiments.py +20 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +222 -149
- customer_retention/integrations/adapters/factory.py +8 -5
- customer_retention/integrations/adapters/feature_store/base.py +1 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +58 -10
- customer_retention/integrations/adapters/mlflow/base.py +8 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +15 -2
- customer_retention/integrations/adapters/mlflow/local.py +7 -0
- customer_retention/integrations/databricks_init.py +153 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +3 -3
- customer_retention/stages/profiling/temporal_feature_engineer.py +2 -2
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +4 -4
- customer_retention/stages/profiling/time_series_profiler.py +5 -5
- customer_retention/stages/profiling/time_window_aggregator.py +3 -3
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +0 -0
- {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a2.dist-info}/WHEEL +0 -0
- {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a2.dist-info}/entry_points.txt +0 -0
- {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a2.dist-info}/licenses/LICENSE +0 -0
customer_retention/integrations/adapters/feature_store/databricks.py

@@ -8,6 +8,24 @@ from ..base import AdapterResult
 from .base import FeatureStoreAdapter, FeatureViewConfig
 
 
+def _import_feature_engineering_client() -> Any:
+    try:
+        from databricks.feature_engineering import FeatureEngineeringClient
+
+        return FeatureEngineeringClient
+    except ImportError:
+        from databricks.feature_store import FeatureStoreClient
+
+        return FeatureStoreClient
+
+
+def _validate_write_mode(mode: str) -> None:
+    if mode == "overwrite":
+        raise ValueError(
+            "FeatureEngineeringClient.write_table only supports mode='merge'. Use mode='merge' instead of 'overwrite'."
+        )
+
+
 class DatabricksFeatureStore(FeatureStoreAdapter):
     def __init__(self, catalog: str = "main", schema: str = "default"):
         if not is_spark_available():
@@ -19,27 +37,46 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
     @property
     def fe_client(self) -> Any:
         if self._fe_client is None:
-
-            self._fe_client =
+            client_cls = _import_feature_engineering_client()
+            self._fe_client = client_cls()
         return self._fe_client
 
     def _full_name(self, name: str) -> str:
         return f"{self.catalog}.{self.schema}.{name}"
 
-    def create_table(
+    def create_table(
+        self, name: str, schema: Dict[str, str], primary_keys: List[str], timeseries_column: Optional[str] = None
+    ) -> AdapterResult:
         full_name = self._full_name(name)
         spark = get_spark_session()
         df = spark.createDataFrame([], self._schema_to_spark(schema))
-
+        kwargs: Dict[str, Any] = {"name": full_name, "primary_keys": primary_keys, "df": df}
+        if timeseries_column:
+            kwargs["timeseries_columns"] = [timeseries_column]
+        self.fe_client.create_table(**kwargs)
         return AdapterResult(success=True, metadata={"name": full_name})
 
     def _schema_to_spark(self, schema: Dict[str, str]) -> Any:
-        from pyspark.sql.types import
-
+        from pyspark.sql.types import (
+            FloatType,
+            IntegerType,
+            StringType,
+            StructField,
+            StructType,
+            TimestampType,
+        )
+
+        type_map = {
+            "int": IntegerType(),
+            "float": FloatType(),
+            "string": StringType(),
+            "timestamp": TimestampType(),
+        }
         fields = [StructField(name, type_map.get(dtype, StringType()), True) for name, dtype in schema.items()]
         return StructType(fields)
 
     def write_table(self, name: str, df: pd.DataFrame, mode: str = "merge") -> AdapterResult:
+        _validate_write_mode(mode)
         full_name = self._full_name(name)
         spark = get_spark_session()
         spark_df = spark.createDataFrame(df)
@@ -72,14 +109,22 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
         table_name = self._full_name(config.name)
         spark = get_spark_session()
         spark_df = spark.createDataFrame(df)
-
+        kwargs: Dict[str, Any] = {"name": table_name, "primary_keys": [config.entity_key], "df": spark_df}
+        if hasattr(config, "timeseries_column") and config.timeseries_column:
+            kwargs["timeseries_columns"] = [config.timeseries_column]
+        self.fe_client.create_table(**kwargs)
         return table_name
 
     def get_historical_features(self, entity_df: pd.DataFrame, feature_refs: List[str]) -> pd.DataFrame:
         from databricks.feature_engineering import FeatureLookup
+
         spark = get_spark_session()
-        lookups = [
-
+        lookups = [
+            FeatureLookup(table_name=ref.split(":")[0], lookup_key=[entity_df.columns[0]]) for ref in feature_refs
+        ]
+        training_set = self.fe_client.create_training_set(
+            df=spark.createDataFrame(entity_df), feature_lookups=lookups, label=None
+        )
         return training_set.load_df().toPandas()
 
     def materialize(self, feature_views: List[str], start_date: str, end_date: str) -> None:
@@ -89,6 +134,9 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
         entity_df = pd.DataFrame(entity_keys)
         spark = get_spark_session()
         from databricks.feature_engineering import FeatureLookup
-
+
+        lookups = [
+            FeatureLookup(table_name=ref.split(":")[0], lookup_key=list(entity_keys.keys())) for ref in feature_refs
+        ]
         result = self.fe_client.score_batch(df=spark.createDataFrame(entity_df), feature_lookups=lookups)
         return result.toPandas().to_dict()
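The new module-level helpers decide which Databricks client class to use and reject unsupported write modes before any Spark work starts. A minimal standalone sketch of the same pattern, assuming a Databricks runtime where at least one of the two client packages is installed (outside Databricks the fallback import itself raises ImportError):

    from typing import Any

    def resolve_feature_client() -> Any:
        # Prefer the Unity Catalog FeatureEngineeringClient; fall back to the
        # legacy Workspace Feature Store client when it is not installed.
        try:
            from databricks.feature_engineering import FeatureEngineeringClient
            return FeatureEngineeringClient
        except ImportError:
            from databricks.feature_store import FeatureStoreClient
            return FeatureStoreClient

    def guard_write_mode(mode: str) -> None:
        # Mirrors _validate_write_mode: only merge semantics are accepted.
        if mode == "overwrite":
            raise ValueError("write_table only supports mode='merge'")

    guard_write_mode("merge")              # passes silently
    client_cls = resolve_feature_client()  # one of the two client classes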
customer_retention/integrations/adapters/mlflow/base.py

@@ -30,3 +30,11 @@ class MLflowAdapter(ABC):
     @abstractmethod
     def transition_stage(self, model_name: str, version: str, stage: str) -> None:
         pass
+
+    @abstractmethod
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        pass
+
+    @abstractmethod
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        pass
customer_retention/integrations/adapters/mlflow/databricks.py

@@ -7,9 +7,12 @@ from .base import MLflowAdapter
 try:
     import mlflow
     from mlflow.tracking import MlflowClient
+
     MLFLOW_AVAILABLE = True
+    MLFLOW_MAJOR_VERSION = int(mlflow.__version__.split(".")[0])
 except ImportError:
     MLFLOW_AVAILABLE = False
+    MLFLOW_MAJOR_VERSION = 0
 
 
 class DatabricksMLflow(MLflowAdapter):
@@ -18,7 +21,8 @@ class DatabricksMLflow(MLflowAdapter):
             raise ImportError("PySpark required for DatabricksMLflow")
         if not MLFLOW_AVAILABLE:
             raise ImportError("mlflow package required")
-
+        if MLFLOW_MAJOR_VERSION < 3:
+            mlflow.set_registry_uri(registry_uri)
         self.registry_uri = registry_uri
         self._client = MlflowClient()
         self._run_id = None
@@ -44,7 +48,10 @@ class DatabricksMLflow(MLflowAdapter):
         mlflow.log_metrics(metrics)
 
     def log_model(self, model: Any, artifact_path: str, registered_name: Optional[str] = None) -> str:
-
+        if MLFLOW_MAJOR_VERSION >= 3:
+            info = mlflow.sklearn.log_model(model, name=artifact_path, registered_model_name=registered_name)
+        else:
+            info = mlflow.sklearn.log_model(model, artifact_path, registered_model_name=registered_name)
         return info.model_uri
 
     def load_model(self, model_uri: str) -> Any:
@@ -52,3 +59,9 @@ class DatabricksMLflow(MLflowAdapter):
 
     def transition_stage(self, model_name: str, version: str, stage: str) -> None:
         self._client.set_model_version_tag(name=model_name, version=version, key="stage", value=stage)
+
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        self._client.set_registered_model_alias(name=model_name, alias=alias, version=version)
+
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        return self._client.get_model_version_by_alias(name=model_name, alias=alias)
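log_model now picks the keyword based on the installed MLflow major version (MLflow 3 renamed the positional artifact_path to name). A hedged standalone sketch of the same branch, assuming mlflow and scikit-learn are installed and a local tracking store is acceptable:

    import mlflow
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=100, n_features=4, random_state=0)
    model = LogisticRegression().fit(X, y)
    major = int(mlflow.__version__.split(".")[0])

    with mlflow.start_run():
        if major >= 3:
            # MLflow 3 takes the artifact location as `name`.
            info = mlflow.sklearn.log_model(model, name="model")
        else:
            # Older MLflow keeps the positional artifact_path.
            info = mlflow.sklearn.log_model(model, "model")
    print(info.model_uri)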
customer_retention/integrations/adapters/mlflow/local.py

@@ -5,6 +5,7 @@ from .base import MLflowAdapter
 try:
     import mlflow
     from mlflow.tracking import MlflowClient
+
     MLFLOW_AVAILABLE = True
 except ImportError:
     MLFLOW_AVAILABLE = False
@@ -48,3 +49,9 @@ class LocalMLflow(MLflowAdapter):
 
     def transition_stage(self, model_name: str, version: str, stage: str) -> None:
         self._client.transition_model_version_stage(name=model_name, version=version, stage=stage)
+
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        self._client.set_registered_model_alias(name=model_name, alias=alias, version=version)
+
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        return self._client.get_model_version_by_alias(name=model_name, alias=alias)
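Both the Databricks and local adapters forward the new alias methods to MlflowClient, which has supported registered-model aliases since roughly MLflow 2.3. A hedged usage sketch; the model name "churn_model" and version "1" are placeholders for a model that is already registered:

    from mlflow.tracking import MlflowClient

    client = MlflowClient()
    # Point the "champion" alias at a specific registered version.
    client.set_registered_model_alias(name="churn_model", alias="champion", version="1")
    # Resolve the alias back to the concrete model version it points at.
    mv = client.get_model_version_by_alias(name="churn_model", alias="champion")
    print(mv.version, mv.run_id)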
customer_retention/integrations/databricks_init.py (new file)

@@ -0,0 +1,153 @@
+from __future__ import annotations
+
+import os
+import shutil
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+
+@dataclass
+class DatabricksInitResult:
+    catalog: str
+    schema: str
+    experiment_name: str
+    workspace_path: str | None
+    model_name: str
+    notebooks_copied: list[str] = field(default_factory=list)
+
+    @property
+    def environment_variables(self) -> dict[str, str]:
+        env_vars = {
+            "CR_CATALOG": self.catalog,
+            "CR_SCHEMA": self.schema,
+            "CR_EXPERIMENT_NAME": self.experiment_name,
+            "CR_EXPERIMENTS_DIR": f"/Workspace/{self.workspace_path}/experiments" if self.workspace_path else "",
+        }
+        if self.workspace_path:
+            env_vars["CR_WORKSPACE_PATH"] = self.workspace_path
+        return env_vars
+
+
+def databricks_init(
+    catalog: str = "main",
+    schema: str = "default",
+    experiment_name: str | None = None,
+    workspace_path: str | None = None,
+    copy_notebooks: bool = True,
+    model_name: str = "customer_retention",
+) -> DatabricksInitResult:
+    _validate_databricks_environment()
+    _set_environment_variables(catalog, schema, workspace_path)
+    resolved_experiment_name = experiment_name or _resolve_experiment_name_from_notebook_path()
+    resolved_experiment_name = _make_absolute_experiment_path(resolved_experiment_name, workspace_path)
+    _set_experiment_name_env_var(resolved_experiment_name)
+    _configure_mlflow_experiment(resolved_experiment_name)
+    notebooks_copied: list[str] = []
+    if copy_notebooks and workspace_path:
+        notebooks_copied = _copy_exploration_notebooks(workspace_path)
+    result = DatabricksInitResult(
+        catalog=catalog,
+        schema=schema,
+        experiment_name=resolved_experiment_name,
+        workspace_path=workspace_path,
+        model_name=model_name,
+        notebooks_copied=notebooks_copied,
+    )
+    _display_init_summary(result)
+    return result
+
+
+def _validate_databricks_environment() -> None:
+    if not os.environ.get("DATABRICKS_RUNTIME_VERSION"):
+        raise RuntimeError(
+            "databricks_init() must be called from a Databricks notebook. "
+            "DATABRICKS_RUNTIME_VERSION not found in environment."
+        )
+
+
+def _set_environment_variables(catalog: str, schema: str, workspace_path: str | None) -> None:
+    os.environ["CR_CATALOG"] = catalog
+    os.environ["CR_SCHEMA"] = schema
+    if workspace_path:
+        os.environ["CR_WORKSPACE_PATH"] = workspace_path
+        os.environ["CR_EXPERIMENTS_DIR"] = f"/Workspace/{workspace_path}/experiments"
+
+
+def _set_experiment_name_env_var(experiment_name: str) -> None:
+    os.environ["CR_EXPERIMENT_NAME"] = experiment_name
+
+
+def _resolve_experiment_name_from_notebook_path() -> str:
+    try:
+        dbutils = _get_dbutils()
+        if dbutils:
+            notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
+            return notebook_path.rsplit("/", 1)[-1]
+    except Exception:
+        pass
+    return "customer_retention"
+
+
+def _get_dbutils() -> Any | None:
+    try:
+        from customer_retention.core.compat.detection import get_dbutils
+
+        return get_dbutils()
+    except Exception:
+        return None
+
+
+def _make_absolute_experiment_path(experiment_name: str, workspace_path: str | None) -> str:
+    if experiment_name.startswith("/"):
+        return experiment_name
+    if not workspace_path:
+        return experiment_name
+    base = workspace_path.removeprefix("/Workspace")
+    if not base.startswith("/"):
+        base = f"/{base}"
+    return f"{base}/{experiment_name}"
+
+
+def _configure_mlflow_experiment(experiment_name: str) -> None:
+    try:
+        import mlflow
+
+        mlflow.set_experiment(experiment_name)
+    except ImportError:
+        pass
+
+
+def _copy_exploration_notebooks(workspace_path: str) -> list[str]:
+    from customer_retention.generators.notebook_generator.project_init import ProjectInitializer
+
+    source_dir = ProjectInitializer(project_name="")._get_exploration_source_dir()
+    if not source_dir or not source_dir.exists():
+        return []
+
+    dest_dir = Path(f"/Workspace/{workspace_path}/exploration_notebooks")
+    dest_dir.mkdir(parents=True, exist_ok=True)
+
+    copied = []
+    for notebook in source_dir.glob("*.ipynb"):
+        dest_path = dest_dir / notebook.name
+        if not dest_path.exists():
+            shutil.copy2(notebook, dest_path)
+            copied.append(str(dest_path))
+
+    return copied
+
+
+def _display_init_summary(result: DatabricksInitResult) -> None:
+    print("ChurnKit Databricks Initialization Complete")
+    print("=" * 45)
+    print(f"  Catalog: {result.catalog}")
+    print(f"  Schema: {result.schema}")
+    print(f"  Experiment: {result.experiment_name}")
+    print(f"  Workspace Path: {result.workspace_path or '(not set)'}")
+    print(f"  Model Name: {result.model_name}")
+    if result.notebooks_copied:
+        print(f"  Notebooks Copied: {len(result.notebooks_copied)}")
+        for nb in result.notebooks_copied:
+            print(f"    - {nb}")
+    print("=" * 45)
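A hedged usage sketch for the new initializer, intended to run in a Databricks notebook cell (outside Databricks it raises RuntimeError); the catalog, schema, and workspace path values below are placeholders:

    from customer_retention.integrations.databricks_init import databricks_init

    result = databricks_init(
        catalog="ml",
        schema="churn",
        workspace_path="Users/someone@example.com/churnkit",
        copy_notebooks=True,
    )
    # CR_CATALOG, CR_SCHEMA, CR_EXPERIMENT_NAME, CR_EXPERIMENTS_DIR, CR_WORKSPACE_PATH
    print(result.environment_variables)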
customer_retention/stages/profiling/temporal_feature_analyzer.py

@@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 from scipy import stats
 
-from customer_retention.core.compat import DataFrame, ensure_datetime_column, pd, qcut, to_pandas
+from customer_retention.core.compat import DataFrame, ensure_datetime_column, native_pd, pd, qcut, to_pandas
 from customer_retention.core.utils import compute_effect_size
 
 
@@ -642,7 +642,7 @@ class TemporalFeatureAnalyzer:
         )
 
     def _calculate_iv(self, feature: pd.Series, target: pd.Series, bins: int = 10) -> float:
-        df_iv =
+        df_iv = native_pd.DataFrame({"feature": feature, "target": target}).dropna()
         if len(df_iv) < bins * 2:
             return 0.0
         try:
@@ -666,7 +666,7 @@ class TemporalFeatureAnalyzer:
         return float(grouped["iv"].sum())
 
     def _calculate_ks(self, feature: pd.Series, target: pd.Series) -> Tuple[float, float]:
-        df_ks =
+        df_ks = native_pd.DataFrame({"feature": feature, "target": target}).dropna()
         group0, group1 = df_ks[df_ks["target"] == 0]["feature"], df_ks[df_ks["target"] == 1]["feature"]
         if len(group0) == 0 or len(group1) == 0:
             return 0.0, 1.0
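The _calculate_iv and _calculate_ks helpers now assemble their working frame with native_pd (the compat layer's handle on the real pandas module) so the computation is backend-independent. A self-contained information-value sketch in plain pandas, mirroring the bin-and-group approach; the 0.5 smoothing constant is an assumption of this sketch, not taken from the package:

    import numpy as np
    import pandas as pd

    def information_value(feature: pd.Series, target: pd.Series, bins: int = 10) -> float:
        df = pd.DataFrame({"feature": feature, "target": target}).dropna()
        if len(df) < bins * 2:
            return 0.0
        # Quantile bins over the feature, then event / non-event counts per bin.
        df["bin"] = pd.qcut(df["feature"], q=bins, duplicates="drop")
        grouped = df.groupby("bin", observed=True)["target"].agg(["count", "sum"])
        events = grouped["sum"]
        non_events = grouped["count"] - grouped["sum"]
        # Small additive smoothing to avoid division by zero (assumption).
        pct_event = (events + 0.5) / events.sum()
        pct_non_event = (non_events + 0.5) / non_events.sum()
        woe = np.log(pct_event / pct_non_event)
        return float(((pct_event - pct_non_event) * woe).sum())

    # Synthetic example
    rng = np.random.default_rng(0)
    x = pd.Series(rng.normal(size=500))
    y = pd.Series((x + rng.normal(scale=2, size=500) > 0).astype(int))
    print(round(information_value(x, y), 3))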
customer_retention/stages/profiling/temporal_feature_engineer.py

@@ -25,7 +25,7 @@ from typing import Any, Dict, List, Optional
 
 import numpy as np
 
-from customer_retention.core.compat import Timedelta, pd, to_datetime, to_pandas
+from customer_retention.core.compat import Timedelta, native_pd, pd, to_datetime, to_pandas
 
 
 class ReferenceMode(Enum):
@@ -307,7 +307,7 @@ class TemporalFeatureEngineer:
 
         if self.config.reference_mode == ReferenceMode.GLOBAL_DATE:
             ref_date = self.config.global_reference_date or datetime.now()
-            return
+            return native_pd.DataFrame({
                 entity_col: entities,
                 "reference_date": ref_date,
             })
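The GLOBAL_DATE branch returns a plain pandas frame in which every entity shares the same reference date. A small illustration; the entity IDs and the "customer_id" column name are placeholders (the real code uses the configured entity column):

    from datetime import datetime
    import pandas as pd

    entities = ["cust_001", "cust_002", "cust_003"]
    ref_date = datetime(2024, 6, 30)
    # One row per entity, all pointing at the same global reference date.
    frame = pd.DataFrame({"customer_id": entities, "reference_date": ref_date})
    print(frame)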
customer_retention/stages/profiling/temporal_pattern_analyzer.py

@@ -10,7 +10,7 @@ from customer_retention.core.compat import (
     Timestamp,
     cut,
     ensure_datetime_column,
-
+    native_pd,
     safe_to_datetime,
     to_pandas,
 )
@@ -316,7 +316,7 @@ def _diagnose_anomaly_pattern(
     entity_first = df.groupby(entity_column)[time_column].min()
     entity_last = df.groupby(entity_column)[time_column].max()
     tenure = (entity_last - entity_first).dt.days
-    tenure_by_target =
+    tenure_by_target = native_pd.DataFrame({"target": entity_target, "tenure": tenure})
     retained_tenure = tenure_by_target[tenure_by_target["target"] == 1]["tenure"]
     churned_tenure = tenure_by_target[tenure_by_target["target"] == 0]["tenure"]
     retained_median_tenure = float(retained_tenure.median()) if len(retained_tenure) > 0 else None
@@ -597,7 +597,7 @@ class TemporalPatternAnalyzer:
 
     def analyze_cohorts(self, df: DataFrame, entity_column: str, cohort_column: str, target_column: Optional[str] = None, period: str = "M") -> DataFrame:
         if len(df) == 0:
-            return
+            return native_pd.DataFrame()
 
         df_copy = to_pandas(df).copy()
         ensure_datetime_column(df_copy, cohort_column)
@@ -638,7 +638,7 @@ class TemporalPatternAnalyzer:
         target_correlation = None
         if target_column and target_column in df.columns:
             entity_target = df.groupby(entity_column)[target_column].first()
-            combined =
+            combined = native_pd.DataFrame({"recency": recency_days, "target": entity_target}).dropna()
 
             if len(combined) > 2:
                 corr, _ = stats.pearsonr(combined["recency"], combined["target"])
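The recency/target check above is an ordinary Pearson correlation on a two-column frame. A hedged sketch of the same calculation on synthetic data:

    import numpy as np
    import pandas as pd
    from scipy import stats

    rng = np.random.default_rng(1)
    recency_days = pd.Series(rng.integers(0, 365, size=200).astype(float))
    entity_target = pd.Series((recency_days + rng.normal(scale=100, size=200) > 180).astype(int))
    combined = pd.DataFrame({"recency": recency_days, "target": entity_target}).dropna()
    if len(combined) > 2:
        corr, _ = stats.pearsonr(combined["recency"], combined["target"])
        print(round(float(corr), 3))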
customer_retention/stages/profiling/time_series_profiler.py

@@ -7,7 +7,7 @@ from customer_retention.core.compat import (
     DataFrame,
     Timestamp,
     ensure_datetime_column,
-
+    native_pd,
     to_pandas,
 )
 
@@ -95,7 +95,7 @@ def classify_lifecycle_quadrants(entity_lifecycles: DataFrame) -> LifecycleQuadr
         lifecycles=lc,
         tenure_threshold=tenure_threshold,
         intensity_threshold=intensity_threshold,
-        recommendations=
+        recommendations=native_pd.DataFrame(rows),
     )
 
 
@@ -160,7 +160,7 @@ def classify_activity_segments(entity_lifecycles: DataFrame) -> ActivitySegmentR
         lifecycles=lc,
         q25_threshold=q25,
         q75_threshold=q75,
-        recommendations=
+        recommendations=native_pd.DataFrame(rows),
     )
 
 
@@ -237,7 +237,7 @@ class TimeSeriesProfiler:
     def _compute_entity_lifecycles(self, df: DataFrame) -> DataFrame:
         grouped = df.groupby(self.entity_column)[self.time_column]
 
-        lifecycles =
+        lifecycles = native_pd.DataFrame({
             "entity": grouped.first().index.tolist(),
             "first_event": grouped.min().values,
             "last_event": grouped.max().values,
@@ -302,7 +302,7 @@ class TimeSeriesProfiler:
             events_per_entity=DistributionStats(
                 min=0, max=0, mean=0, median=0, std=0, q25=0, q75=0
             ),
-            entity_lifecycles=
+            entity_lifecycles=native_pd.DataFrame(columns=[
                 "entity", "first_event", "last_event", "duration_days", "event_count"
            ]),
             avg_inter_event_days=None,
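The lifecycle frame built in _compute_entity_lifecycles boils down to per-entity first/last timestamps from a single groupby. A small illustration with synthetic events; the "customer_id" and "event_ts" column names are placeholders for the configured entity and time columns:

    import pandas as pd

    events = pd.DataFrame({
        "customer_id": ["a", "a", "a", "b"],
        "event_ts": pd.to_datetime(["2024-01-01", "2024-02-01", "2024-03-15", "2024-01-10"]),
    })
    grouped = events.groupby("customer_id")["event_ts"]
    lifecycles = pd.DataFrame({
        "entity": grouped.first().index.tolist(),
        "first_event": grouped.min().values,
        "last_event": grouped.max().values,
        "event_count": grouped.count().values,
        "duration_days": (grouped.max() - grouped.min()).dt.days.values,
    })
    print(lifecycles)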
customer_retention/stages/profiling/time_window_aggregator.py

@@ -12,7 +12,7 @@ from customer_retention.core.compat import (
     Timestamp,
     ensure_datetime_column,
     is_numeric_dtype,
-
+    native_pd,
     to_pandas,
 )
 
@@ -85,7 +85,7 @@ class TimeWindowAggregator:
     ) -> DataFrame:
         df = to_pandas(df)
         if len(df) == 0:
-            return
+            return native_pd.DataFrame()
 
         df = df.copy()
         ensure_datetime_column(df, self.time_column)
@@ -113,7 +113,7 @@ class TimeWindowAggregator:
         if include_tenure:
             result_data["days_since_first_event"] = self._compute_tenure(df, entities, reference_date)
 
-        result =
+        result = native_pd.DataFrame(result_data)
         result.attrs["aggregation_reference_date"] = (
             reference_date.isoformat() if hasattr(reference_date, "isoformat") else str(reference_date))
         result.attrs["aggregation_timestamp"] = Timestamp.now().isoformat()
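The aggregation result comes back as a plain pandas frame with provenance stored in DataFrame.attrs, a free-form metadata dict that rides along with the frame. A minimal illustration; the column names are placeholders:

    import pandas as pd

    result = pd.DataFrame({"entity": ["a", "b"], "event_count_30d": [3, 5]})
    result.attrs["aggregation_reference_date"] = pd.Timestamp("2024-06-30").isoformat()
    result.attrs["aggregation_timestamp"] = pd.Timestamp.now().isoformat()
    print(result.attrs)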
The remaining files listed above (the 19 exploration notebooks plus WHEEL, entry_points.txt, and licenses/LICENSE) were renamed between the version-specific .data/.dist-info directories without content changes.