churnkit 0.75.1a3__py3-none-any.whl → 0.76.0a1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the supported public registries. It is provided for informational purposes only.
- {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a1.dist-info}/METADATA +5 -2
- {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a1.dist-info}/RECORD +41 -40
- customer_retention/__init__.py +11 -1
- customer_retention/core/compat/__init__.py +3 -0
- customer_retention/core/config/__init__.py +43 -8
- customer_retention/core/config/experiments.py +20 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +222 -149
- customer_retention/integrations/adapters/factory.py +8 -5
- customer_retention/integrations/adapters/feature_store/base.py +1 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +58 -10
- customer_retention/integrations/adapters/mlflow/base.py +8 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +15 -2
- customer_retention/integrations/adapters/mlflow/local.py +7 -0
- customer_retention/integrations/databricks_init.py +141 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +3 -3
- customer_retention/stages/profiling/temporal_feature_engineer.py +2 -2
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +4 -3
- customer_retention/stages/profiling/time_series_profiler.py +5 -4
- customer_retention/stages/profiling/time_window_aggregator.py +3 -2
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +0 -0
- {churnkit-0.75.1a3.data → churnkit-0.76.0a1.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +0 -0
- {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a1.dist-info}/WHEEL +0 -0
- {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a1.dist-info}/entry_points.txt +0 -0
- {churnkit-0.75.1a3.dist-info → churnkit-0.76.0a1.dist-info}/licenses/LICENSE +0 -0
customer_retention/integrations/adapters/feature_store/databricks.py

@@ -8,6 +8,24 @@ from ..base import AdapterResult
from .base import FeatureStoreAdapter, FeatureViewConfig


+def _import_feature_engineering_client() -> Any:
+    try:
+        from databricks.feature_engineering import FeatureEngineeringClient
+
+        return FeatureEngineeringClient
+    except ImportError:
+        from databricks.feature_store import FeatureStoreClient
+
+        return FeatureStoreClient
+
+
+def _validate_write_mode(mode: str) -> None:
+    if mode == "overwrite":
+        raise ValueError(
+            "FeatureEngineeringClient.write_table only supports mode='merge'. Use mode='merge' instead of 'overwrite'."
+        )
+
+
class DatabricksFeatureStore(FeatureStoreAdapter):
    def __init__(self, catalog: str = "main", schema: str = "default"):
        if not is_spark_available():

@@ -19,27 +37,46 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
    @property
    def fe_client(self) -> Any:
        if self._fe_client is None:
-
-            self._fe_client =
+            client_cls = _import_feature_engineering_client()
+            self._fe_client = client_cls()
        return self._fe_client

    def _full_name(self, name: str) -> str:
        return f"{self.catalog}.{self.schema}.{name}"

-    def create_table(
+    def create_table(
+        self, name: str, schema: Dict[str, str], primary_keys: List[str], timeseries_column: Optional[str] = None
+    ) -> AdapterResult:
        full_name = self._full_name(name)
        spark = get_spark_session()
        df = spark.createDataFrame([], self._schema_to_spark(schema))
-
+        kwargs: Dict[str, Any] = {"name": full_name, "primary_keys": primary_keys, "df": df}
+        if timeseries_column:
+            kwargs["timeseries_columns"] = [timeseries_column]
+        self.fe_client.create_table(**kwargs)
        return AdapterResult(success=True, metadata={"name": full_name})

    def _schema_to_spark(self, schema: Dict[str, str]) -> Any:
-        from pyspark.sql.types import
-
+        from pyspark.sql.types import (
+            FloatType,
+            IntegerType,
+            StringType,
+            StructField,
+            StructType,
+            TimestampType,
+        )
+
+        type_map = {
+            "int": IntegerType(),
+            "float": FloatType(),
+            "string": StringType(),
+            "timestamp": TimestampType(),
+        }
        fields = [StructField(name, type_map.get(dtype, StringType()), True) for name, dtype in schema.items()]
        return StructType(fields)

    def write_table(self, name: str, df: pd.DataFrame, mode: str = "merge") -> AdapterResult:
+        _validate_write_mode(mode)
        full_name = self._full_name(name)
        spark = get_spark_session()
        spark_df = spark.createDataFrame(df)

@@ -72,14 +109,22 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
        table_name = self._full_name(config.name)
        spark = get_spark_session()
        spark_df = spark.createDataFrame(df)
-
+        kwargs: Dict[str, Any] = {"name": table_name, "primary_keys": [config.entity_key], "df": spark_df}
+        if hasattr(config, "timeseries_column") and config.timeseries_column:
+            kwargs["timeseries_columns"] = [config.timeseries_column]
+        self.fe_client.create_table(**kwargs)
        return table_name

    def get_historical_features(self, entity_df: pd.DataFrame, feature_refs: List[str]) -> pd.DataFrame:
        from databricks.feature_engineering import FeatureLookup
+
        spark = get_spark_session()
-        lookups = [
-
+        lookups = [
+            FeatureLookup(table_name=ref.split(":")[0], lookup_key=[entity_df.columns[0]]) for ref in feature_refs
+        ]
+        training_set = self.fe_client.create_training_set(
+            df=spark.createDataFrame(entity_df), feature_lookups=lookups, label=None
+        )
        return training_set.load_df().toPandas()

    def materialize(self, feature_views: List[str], start_date: str, end_date: str) -> None:

@@ -89,6 +134,9 @@ class DatabricksFeatureStore(FeatureStoreAdapter):
        entity_df = pd.DataFrame(entity_keys)
        spark = get_spark_session()
        from databricks.feature_engineering import FeatureLookup
-
+
+        lookups = [
+            FeatureLookup(table_name=ref.split(":")[0], lookup_key=list(entity_keys.keys())) for ref in feature_refs
+        ]
        result = self.fe_client.score_batch(df=spark.createDataFrame(entity_df), feature_lookups=lookups)
        return result.toPandas().to_dict()
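
For orientation, a minimal usage sketch of the adapter surface changed above (this assumes a Databricks runtime where Spark and the feature-engineering client are importable; the table name, columns, and data values are illustrative, not from the package):

import pandas as pd

from customer_retention.integrations.adapters.feature_store.databricks import DatabricksFeatureStore

store = DatabricksFeatureStore(catalog="main", schema="churn_demo")

# create_table now forwards an optional timeseries column to the client as timeseries_columns=[...]
store.create_table(
    name="customer_features",
    schema={"customer_id": "string", "tenure_days": "int", "event_ts": "timestamp"},
    primary_keys=["customer_id"],
    timeseries_column="event_ts",
)

# write_table validates the mode up front: "overwrite" raises ValueError, "merge" is accepted
df = pd.DataFrame({"customer_id": ["c-001"], "tenure_days": [42], "event_ts": [pd.Timestamp("2024-01-01")]})
store.write_table("customer_features", df, mode="merge")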
customer_retention/integrations/adapters/mlflow/base.py

@@ -30,3 +30,11 @@ class MLflowAdapter(ABC):
    @abstractmethod
    def transition_stage(self, model_name: str, version: str, stage: str) -> None:
        pass
+
+    @abstractmethod
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        pass
+
+    @abstractmethod
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        pass
customer_retention/integrations/adapters/mlflow/databricks.py

@@ -7,9 +7,12 @@ from .base import MLflowAdapter
try:
    import mlflow
    from mlflow.tracking import MlflowClient
+
    MLFLOW_AVAILABLE = True
+    MLFLOW_MAJOR_VERSION = int(mlflow.__version__.split(".")[0])
except ImportError:
    MLFLOW_AVAILABLE = False
+    MLFLOW_MAJOR_VERSION = 0


class DatabricksMLflow(MLflowAdapter):

@@ -18,7 +21,8 @@ class DatabricksMLflow(MLflowAdapter):
            raise ImportError("PySpark required for DatabricksMLflow")
        if not MLFLOW_AVAILABLE:
            raise ImportError("mlflow package required")
-
+        if MLFLOW_MAJOR_VERSION < 3:
+            mlflow.set_registry_uri(registry_uri)
        self.registry_uri = registry_uri
        self._client = MlflowClient()
        self._run_id = None

@@ -44,7 +48,10 @@ class DatabricksMLflow(MLflowAdapter):
        mlflow.log_metrics(metrics)

    def log_model(self, model: Any, artifact_path: str, registered_name: Optional[str] = None) -> str:
-
+        if MLFLOW_MAJOR_VERSION >= 3:
+            info = mlflow.sklearn.log_model(model, name=artifact_path, registered_model_name=registered_name)
+        else:
+            info = mlflow.sklearn.log_model(model, artifact_path, registered_model_name=registered_name)
        return info.model_uri

    def load_model(self, model_uri: str) -> Any:

@@ -52,3 +59,9 @@ class DatabricksMLflow(MLflowAdapter):

    def transition_stage(self, model_name: str, version: str, stage: str) -> None:
        self._client.set_model_version_tag(name=model_name, version=version, key="stage", value=stage)
+
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        self._client.set_registered_model_alias(name=model_name, alias=alias, version=version)
+
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        return self._client.get_model_version_by_alias(name=model_name, alias=alias)
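
A hedged sketch of the alias workflow this adapter now exposes (assumes a Databricks workspace with MLflow installed; the registry URI argument, model name, and alias value are illustrative):

from customer_retention.integrations.adapters.mlflow.databricks import DatabricksMLflow

adapter = DatabricksMLflow(registry_uri="databricks-uc")  # registry URI is only applied explicitly when mlflow < 3

# Pin a registered model version to an alias, then resolve it back
adapter.set_alias("churn_model", alias="champion", version="3")
model_version = adapter.get_model_by_alias("churn_model", alias="champion")
print(model_version.version)  # ModelVersion returned by MlflowClient.get_model_version_by_alias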
customer_retention/integrations/adapters/mlflow/local.py

@@ -5,6 +5,7 @@ from .base import MLflowAdapter
try:
    import mlflow
    from mlflow.tracking import MlflowClient
+
    MLFLOW_AVAILABLE = True
except ImportError:
    MLFLOW_AVAILABLE = False

@@ -48,3 +49,9 @@ class LocalMLflow(MLflowAdapter):

    def transition_stage(self, model_name: str, version: str, stage: str) -> None:
        self._client.transition_model_version_stage(name=model_name, version=version, stage=stage)
+
+    def set_alias(self, model_name: str, alias: str, version: str) -> None:
+        self._client.set_registered_model_alias(name=model_name, alias=alias, version=version)
+
+    def get_model_by_alias(self, model_name: str, alias: str) -> Any:
+        return self._client.get_model_version_by_alias(name=model_name, alias=alias)
customer_retention/integrations/databricks_init.py (new file)

@@ -0,0 +1,141 @@
+from __future__ import annotations
+
+import os
+import shutil
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+
+@dataclass
+class DatabricksInitResult:
+    catalog: str
+    schema: str
+    experiment_name: str
+    workspace_path: str | None
+    model_name: str
+    notebooks_copied: list[str] = field(default_factory=list)
+
+    @property
+    def environment_variables(self) -> dict[str, str]:
+        env_vars = {
+            "CR_CATALOG": self.catalog,
+            "CR_SCHEMA": self.schema,
+            "CR_EXPERIMENT_NAME": self.experiment_name,
+            "CR_EXPERIMENTS_DIR": f"/Workspace/{self.workspace_path}/experiments" if self.workspace_path else "",
+        }
+        if self.workspace_path:
+            env_vars["CR_WORKSPACE_PATH"] = self.workspace_path
+        return env_vars
+
+
+def databricks_init(
+    catalog: str = "main",
+    schema: str = "default",
+    experiment_name: str | None = None,
+    workspace_path: str | None = None,
+    copy_notebooks: bool = True,
+    model_name: str = "customer_retention",
+) -> DatabricksInitResult:
+    _validate_databricks_environment()
+    _set_environment_variables(catalog, schema, workspace_path)
+    resolved_experiment_name = experiment_name or _resolve_experiment_name_from_notebook_path()
+    _set_experiment_name_env_var(resolved_experiment_name)
+    _configure_mlflow_experiment(resolved_experiment_name)
+    notebooks_copied: list[str] = []
+    if copy_notebooks and workspace_path:
+        notebooks_copied = _copy_exploration_notebooks(workspace_path)
+    result = DatabricksInitResult(
+        catalog=catalog,
+        schema=schema,
+        experiment_name=resolved_experiment_name,
+        workspace_path=workspace_path,
+        model_name=model_name,
+        notebooks_copied=notebooks_copied,
+    )
+    _display_init_summary(result)
+    return result
+
+
+def _validate_databricks_environment() -> None:
+    if not os.environ.get("DATABRICKS_RUNTIME_VERSION"):
+        raise RuntimeError(
+            "databricks_init() must be called from a Databricks notebook. "
+            "DATABRICKS_RUNTIME_VERSION not found in environment."
+        )
+
+
+def _set_environment_variables(catalog: str, schema: str, workspace_path: str | None) -> None:
+    os.environ["CR_CATALOG"] = catalog
+    os.environ["CR_SCHEMA"] = schema
+    if workspace_path:
+        os.environ["CR_WORKSPACE_PATH"] = workspace_path
+        os.environ["CR_EXPERIMENTS_DIR"] = f"/Workspace/{workspace_path}/experiments"
+
+
+def _set_experiment_name_env_var(experiment_name: str) -> None:
+    os.environ["CR_EXPERIMENT_NAME"] = experiment_name
+
+
+def _resolve_experiment_name_from_notebook_path() -> str:
+    try:
+        dbutils = _get_dbutils()
+        if dbutils:
+            notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
+            return notebook_path.rsplit("/", 1)[-1]
+    except Exception:
+        pass
+    return "customer_retention"
+
+
+def _get_dbutils() -> Any | None:
+    try:
+        from customer_retention.core.compat.detection import get_dbutils
+
+        return get_dbutils()
+    except Exception:
+        return None
+
+
+def _configure_mlflow_experiment(experiment_name: str) -> None:
+    try:
+        import mlflow
+
+        mlflow.set_experiment(experiment_name)
+    except ImportError:
+        pass
+
+
+def _copy_exploration_notebooks(workspace_path: str) -> list[str]:
+    from customer_retention.generators.notebook_generator.project_init import ProjectInitializer
+
+    source_dir = ProjectInitializer(project_name="")._get_exploration_source_dir()
+    if not source_dir or not source_dir.exists():
+        return []
+
+    dest_dir = Path(f"/Workspace/{workspace_path}/exploration_notebooks")
+    dest_dir.mkdir(parents=True, exist_ok=True)
+
+    copied = []
+    for notebook in source_dir.glob("*.ipynb"):
+        dest_path = dest_dir / notebook.name
+        if not dest_path.exists():
+            shutil.copy2(notebook, dest_path)
+            copied.append(str(dest_path))
+
+    return copied
+
+
+def _display_init_summary(result: DatabricksInitResult) -> None:
+    print("ChurnKit Databricks Initialization Complete")
+    print("=" * 45)
+    print(f" Catalog: {result.catalog}")
+    print(f" Schema: {result.schema}")
+    print(f" Experiment: {result.experiment_name}")
+    print(f" Workspace Path: {result.workspace_path or '(not set)'}")
+    print(f" Model Name: {result.model_name}")
+    if result.notebooks_copied:
+        print(f" Notebooks Copied: {len(result.notebooks_copied)}")
+        for nb in result.notebooks_copied:
+            print(f" - {nb}")
+    print("=" * 45)
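
A minimal sketch of calling the new initializer from a Databricks notebook cell (the catalog, schema, and workspace path values are illustrative; outside Databricks the call raises RuntimeError because DATABRICKS_RUNTIME_VERSION is absent):

from customer_retention.integrations.databricks_init import databricks_init

result = databricks_init(
    catalog="main",
    schema="churn_demo",
    workspace_path="Users/someone@example.com/churnkit",  # enables notebook copying and CR_WORKSPACE_PATH
)

# Environment variables derived from the call, available to downstream notebooks
print(result.environment_variables)  # CR_CATALOG, CR_SCHEMA, CR_EXPERIMENT_NAME, CR_EXPERIMENTS_DIR, ...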
customer_retention/stages/profiling/temporal_feature_analyzer.py

@@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from scipy import stats

-from customer_retention.core.compat import DataFrame, ensure_datetime_column, pd, qcut, to_pandas
+from customer_retention.core.compat import DataFrame, ensure_datetime_column, native_pd, pd, qcut, to_pandas
from customer_retention.core.utils import compute_effect_size


@@ -642,7 +642,7 @@ class TemporalFeatureAnalyzer:
        )

    def _calculate_iv(self, feature: pd.Series, target: pd.Series, bins: int = 10) -> float:
-        df_iv =
+        df_iv = native_pd.DataFrame({"feature": feature, "target": target}).dropna()
        if len(df_iv) < bins * 2:
            return 0.0
        try:

@@ -666,7 +666,7 @@ class TemporalFeatureAnalyzer:
        return float(grouped["iv"].sum())

    def _calculate_ks(self, feature: pd.Series, target: pd.Series) -> Tuple[float, float]:
-        df_ks =
+        df_ks = native_pd.DataFrame({"feature": feature, "target": target}).dropna()
        group0, group1 = df_ks[df_ks["target"] == 0]["feature"], df_ks[df_ks["target"] == 1]["feature"]
        if len(group0) == 0 or len(group1) == 0:
            return 0.0, 1.0
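
The recurring change in this and the following profiling modules is that small intermediate frames are built with native_pd from the compat layer. A tiny sketch of the pattern, assuming native_pd is the plain pandas module re-exported by customer_retention.core.compat (the data values are illustrative):

from customer_retention.core.compat import native_pd

# Build a small intermediate frame in plain pandas, regardless of the caller's DataFrame backend
feature = [0.1, 0.4, None, 0.9]
target = [0, 1, 1, 0]
df_iv = native_pd.DataFrame({"feature": feature, "target": target}).dropna()
print(len(df_iv))  # 3 rows remain after dropping the missing feature value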
customer_retention/stages/profiling/temporal_feature_engineer.py

@@ -25,7 +25,7 @@ from typing import Any, Dict, List, Optional

import numpy as np

-from customer_retention.core.compat import Timedelta, pd, to_datetime, to_pandas
+from customer_retention.core.compat import Timedelta, native_pd, pd, to_datetime, to_pandas


class ReferenceMode(Enum):

@@ -307,7 +307,7 @@ class TemporalFeatureEngineer:

        if self.config.reference_mode == ReferenceMode.GLOBAL_DATE:
            ref_date = self.config.global_reference_date or datetime.now()
-            return
+            return native_pd.DataFrame({
                entity_col: entities,
                "reference_date": ref_date,
            })
customer_retention/stages/profiling/temporal_pattern_analyzer.py

@@ -10,6 +10,7 @@ from customer_retention.core.compat import (
    Timestamp,
    cut,
    ensure_datetime_column,
+    native_pd,
    pd,
    safe_to_datetime,
    to_pandas,

@@ -316,7 +317,7 @@ def _diagnose_anomaly_pattern(
    entity_first = df.groupby(entity_column)[time_column].min()
    entity_last = df.groupby(entity_column)[time_column].max()
    tenure = (entity_last - entity_first).dt.days
-    tenure_by_target =
+    tenure_by_target = native_pd.DataFrame({"target": entity_target, "tenure": tenure})
    retained_tenure = tenure_by_target[tenure_by_target["target"] == 1]["tenure"]
    churned_tenure = tenure_by_target[tenure_by_target["target"] == 0]["tenure"]
    retained_median_tenure = float(retained_tenure.median()) if len(retained_tenure) > 0 else None

@@ -597,7 +598,7 @@ class TemporalPatternAnalyzer:

    def analyze_cohorts(self, df: DataFrame, entity_column: str, cohort_column: str, target_column: Optional[str] = None, period: str = "M") -> DataFrame:
        if len(df) == 0:
-            return
+            return native_pd.DataFrame()

        df_copy = to_pandas(df).copy()
        ensure_datetime_column(df_copy, cohort_column)

@@ -638,7 +639,7 @@ class TemporalPatternAnalyzer:
        target_correlation = None
        if target_column and target_column in df.columns:
            entity_target = df.groupby(entity_column)[target_column].first()
-            combined =
+            combined = native_pd.DataFrame({"recency": recency_days, "target": entity_target}).dropna()

            if len(combined) > 2:
                corr, _ = stats.pearsonr(combined["recency"], combined["target"])
customer_retention/stages/profiling/time_series_profiler.py

@@ -7,6 +7,7 @@ from customer_retention.core.compat import (
    DataFrame,
    Timestamp,
    ensure_datetime_column,
+    native_pd,
    pd,
    to_pandas,
)

@@ -95,7 +96,7 @@ def classify_lifecycle_quadrants(entity_lifecycles: DataFrame) -> LifecycleQuadr
        lifecycles=lc,
        tenure_threshold=tenure_threshold,
        intensity_threshold=intensity_threshold,
-        recommendations=
+        recommendations=native_pd.DataFrame(rows),
    )


@@ -160,7 +161,7 @@ def classify_activity_segments(entity_lifecycles: DataFrame) -> ActivitySegmentR
        lifecycles=lc,
        q25_threshold=q25,
        q75_threshold=q75,
-        recommendations=
+        recommendations=native_pd.DataFrame(rows),
    )


@@ -237,7 +238,7 @@ class TimeSeriesProfiler:
    def _compute_entity_lifecycles(self, df: DataFrame) -> DataFrame:
        grouped = df.groupby(self.entity_column)[self.time_column]

-        lifecycles =
+        lifecycles = native_pd.DataFrame({
            "entity": grouped.first().index.tolist(),
            "first_event": grouped.min().values,
            "last_event": grouped.max().values,

@@ -302,7 +303,7 @@ class TimeSeriesProfiler:
            events_per_entity=DistributionStats(
                min=0, max=0, mean=0, median=0, std=0, q25=0, q75=0
            ),
-            entity_lifecycles=
+            entity_lifecycles=native_pd.DataFrame(columns=[
                "entity", "first_event", "last_event", "duration_days", "event_count"
            ]),
            avg_inter_event_days=None,
customer_retention/stages/profiling/time_window_aggregator.py

@@ -12,6 +12,7 @@ from customer_retention.core.compat import (
    Timestamp,
    ensure_datetime_column,
    is_numeric_dtype,
+    native_pd,
    pd,
    to_pandas,
)

@@ -85,7 +86,7 @@ class TimeWindowAggregator:
    ) -> DataFrame:
        df = to_pandas(df)
        if len(df) == 0:
-            return
+            return native_pd.DataFrame()

        df = df.copy()
        ensure_datetime_column(df, self.time_column)

@@ -113,7 +114,7 @@ class TimeWindowAggregator:
        if include_tenure:
            result_data["days_since_first_event"] = self._compute_tenure(df, entities, reference_date)

-        result =
+        result = native_pd.DataFrame(result_data)
        result.attrs["aggregation_reference_date"] = (
            reference_date.isoformat() if hasattr(reference_date, "isoformat") else str(reference_date))
        result.attrs["aggregation_timestamp"] = Timestamp.now().isoformat()
The remaining entries in the list above (the exploration notebooks, WHEEL, entry_points.txt, and LICENSE) have no content changes.