churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,6 @@
1
+ from .base import MLflowAdapter
2
+ from .databricks import DatabricksMLflow
3
+ from .experiment_tracker import ExperimentTracker
4
+ from .local import LocalMLflow
5
+
6
+ __all__ = ["MLflowAdapter", "LocalMLflow", "DatabricksMLflow", "ExperimentTracker"]
@@ -0,0 +1,32 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, Dict, Optional
3
+
4
+
5
class MLflowAdapter(ABC):
    """Abstract interface for an MLflow tracking / model-registry backend.

    Concrete adapters must implement every method below; the base class
    carries no state and no behaviour of its own.
    """

    @abstractmethod
    def start_run(self, experiment_name: str, run_name: Optional[str] = None) -> str:
        """Start a run in ``experiment_name`` and return a string identifier for it."""

    @abstractmethod
    def end_run(self) -> None:
        """Finish the run previously started with :meth:`start_run`."""

    @abstractmethod
    def log_params(self, params: Dict[str, Any]) -> None:
        """Record a mapping of parameter names to values."""

    @abstractmethod
    def log_metrics(self, metrics: Dict[str, float]) -> None:
        """Record a mapping of metric names to numeric values."""

    @abstractmethod
    def log_model(self, model: Any, artifact_path: str, registered_name: Optional[str] = None) -> str:
        """Persist ``model`` under ``artifact_path`` and return a string locating it.

        ``registered_name`` is optional; when given, implementations may use it
        to register the model under that name.
        """

    @abstractmethod
    def load_model(self, model_uri: str) -> Any:
        """Load and return the model stored at ``model_uri``."""

    @abstractmethod
    def transition_stage(self, model_name: str, version: str, stage: str) -> None:
        """Mark ``version`` of ``model_name`` as being in ``stage``."""
@@ -0,0 +1,54 @@
1
+ from typing import Any, Dict, Optional
2
+
3
+ from customer_retention.core.compat.detection import is_spark_available
4
+
5
+ from .base import MLflowAdapter
6
+
7
+ try:
8
+ import mlflow
9
+ from mlflow.tracking import MlflowClient
10
+ MLFLOW_AVAILABLE = True
11
+ except ImportError:
12
+ MLFLOW_AVAILABLE = False
13
+
14
+
15
class DatabricksMLflow(MLflowAdapter):
    """MLflow adapter that points the model registry at a Databricks registry URI.

    Run, parameter, metric and model logging all go through the module-level
    ``mlflow`` fluent API; only :meth:`transition_stage` uses the client.
    """

    def __init__(self, registry_uri: str = "databricks-uc"):
        """Configure the MLflow registry URI and create a client.

        Raises:
            ImportError: if Spark is not available, or if ``mlflow`` could not
                be imported (checked in that order).
        """
        spark_ready = is_spark_available()
        if not spark_ready:
            raise ImportError("PySpark required for DatabricksMLflow")
        if not MLFLOW_AVAILABLE:
            raise ImportError("mlflow package required")
        mlflow.set_registry_uri(registry_uri)
        self.registry_uri = registry_uri
        self._client = MlflowClient()
        # Id of the run opened by start_run(); None while no run is open.
        self._run_id = None

    def start_run(self, experiment_name: str, run_name: Optional[str] = None) -> str:
        """Start a run in ``experiment_name`` (creating the experiment if missing) and return its id."""
        existing = mlflow.get_experiment_by_name(experiment_name)
        experiment_id = (
            mlflow.create_experiment(experiment_name)
            if existing is None
            else existing.experiment_id
        )
        active = mlflow.start_run(experiment_id=experiment_id, run_name=run_name)
        self._run_id = active.info.run_id
        return self._run_id

    def end_run(self) -> None:
        """End the active MLflow run and forget its id."""
        mlflow.end_run()
        self._run_id = None

    def log_params(self, params: Dict[str, Any]) -> None:
        """Log ``params`` via ``mlflow.log_params``."""
        mlflow.log_params(params)

    def log_metrics(self, metrics: Dict[str, float]) -> None:
        """Log ``metrics`` via ``mlflow.log_metrics``."""
        mlflow.log_metrics(metrics)

    def log_model(self, model: Any, artifact_path: str, registered_name: Optional[str] = None) -> str:
        """Log ``model`` with the sklearn flavour and return the resulting model URI."""
        logged = mlflow.sklearn.log_model(model, artifact_path, registered_model_name=registered_name)
        return logged.model_uri

    def load_model(self, model_uri: str) -> Any:
        """Load a model with the sklearn flavour from ``model_uri``."""
        return mlflow.sklearn.load_model(model_uri)

    def transition_stage(self, model_name: str, version: str, stage: str) -> None:
        """Record ``stage`` as the value of a ``"stage"`` tag on the given model version."""
        self._client.set_model_version_tag(
            name=model_name,
            version=version,
            key="stage",
            value=stage,
        )
@@ -0,0 +1,161 @@
1
+ from pathlib import Path
2
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
3
+
4
+ try:
5
+ import mlflow
6
+ from mlflow.tracking import MlflowClient
7
+ MLFLOW_AVAILABLE = True
8
+ except ImportError:
9
+ MLFLOW_AVAILABLE = False
10
+
11
+ if TYPE_CHECKING:
12
+ from customer_retention.analysis.auto_explorer.findings import ExplorationFindings
13
+ from customer_retention.analysis.recommendations.pipeline import RecommendationPipeline
14
+
15
+
16
class ExperimentTracker:
    """Log exploration findings, recommendation pipelines and trained models to MLflow.

    Every ``log_*`` method opens its own MLflow run inside the experiment named
    ``experiment_name`` (created on demand), logs into it, closes it, and
    returns the run id.
    """

    def __init__(self, tracking_uri: str = "./mlruns", experiment_name: str = "customer_retention"):
        """Point MLflow at ``tracking_uri`` and make sure the experiment exists.

        Note that this sets the process-wide MLflow tracking URI.

        Raises:
            ImportError: if ``mlflow`` could not be imported.
        """
        if not MLFLOW_AVAILABLE:
            raise ImportError("mlflow package required. Install with: uv sync --extra ml")
        mlflow.set_tracking_uri(tracking_uri)
        self.tracking_uri = tracking_uri
        self.experiment_name = experiment_name
        self._client = MlflowClient(tracking_uri=tracking_uri)
        self._ensure_experiment()

    def log_exploration(self, findings: "ExplorationFindings", run_name: Optional[str] = None) -> str:
        """Log params, metrics, per-type column counts, the findings dict and tags for ``findings``.

        The default run name is ``exploration_<stem of findings.source_path>``.

        Returns:
            The id of the run that was created.
        """
        with mlflow.start_run(
            run_name=run_name or f"exploration_{Path(findings.source_path).stem}",
            experiment_id=self._ensure_experiment(),
        ) as run:
            self._log_exploration_params(findings)
            self._log_exploration_metrics(findings)
            self._log_column_metrics(findings)
            mlflow.log_dict(findings.to_dict(), "exploration_findings.json")
            self._set_exploration_tags(findings)
        return run.info.run_id

    def log_pipeline_execution(
        self, pipeline: "RecommendationPipeline", run_name: Optional[str] = None,
        parent_run_id: Optional[str] = None
    ) -> str:
        """Log a recommendation pipeline's params, metrics and generated artifacts.

        Args:
            pipeline: The pipeline to record.
            run_name: Run name; defaults to ``"recommendation_pipeline"``.
            parent_run_id: Id of the run this one should be shown under. The
                run is started with ``nested=True`` and the id is also written
                to the ``mlflow.parentRunId`` tag, so the link is recorded even
                when the parent run is no longer active.

        Returns:
            The id of the run that was created.
        """
        tags = {"stage": "transformation", "pipeline_fitted": str(pipeline._is_fitted)}
        if parent_run_id is not None:
            # ``nested=True`` only links to a currently *active* run; the other
            # log_* methods close their runs before returning, so the explicit
            # parent id must be recorded through the tag.
            tags["mlflow.parentRunId"] = parent_run_id
        with mlflow.start_run(
            run_name=run_name or "recommendation_pipeline",
            experiment_id=self._ensure_experiment(),
            nested=parent_run_id is not None,
        ) as run:
            self._log_pipeline_params(pipeline)
            self._log_pipeline_metrics(pipeline)
            self._log_pipeline_artifacts(pipeline)
            mlflow.set_tags(tags)
        return run.info.run_id

    def log_model_training(
        self, model: Any, metrics: Dict[str, float], params: Dict[str, Any],
        model_name: str = "churn_model", run_name: Optional[str] = None
    ) -> str:
        """Log ``params``, ``metrics`` and ``model`` (sklearn flavour, registered as ``model_name``).

        The default run name is ``training_<model_name>``.

        Returns:
            The id of the run that was created.
        """
        with mlflow.start_run(
            run_name=run_name or f"training_{model_name}",
            experiment_id=self._ensure_experiment(),
        ) as run:
            mlflow.log_params(params)
            mlflow.log_metrics(metrics)
            mlflow.sklearn.log_model(model, "model", registered_model_name=model_name)
            mlflow.set_tags({"stage": "training", "model_name": model_name})
        return run.info.run_id

    def get_best_run(self, metric: str = "overall_quality_score", ascending: bool = False) -> Optional[Dict]:
        """Return the top run of the experiment ordered by ``metric``, as a dictionary.

        Args:
            metric: Metric name to order by.
            ascending: Order ascending instead of descending.

        Returns:
            The first run's ``to_dictionary()`` result, or ``None`` when the
            experiment does not exist or the search returns no runs.
        """
        experiment = self._find_experiment()
        if experiment is None:
            return None
        direction = "ASC" if ascending else "DESC"
        runs = self._client.search_runs(
            experiment_ids=[experiment.experiment_id],
            order_by=[f"metrics.{metric} {direction}"],
            max_results=1,
        )
        return runs[0].to_dictionary() if runs else None

    def list_exploration_runs(self) -> List[Dict]:
        """Return every run tagged ``stage = 'exploration'`` as a list of dictionaries.

        Returns an empty list when the experiment does not exist.
        """
        experiment = self._find_experiment()
        if experiment is None:
            return []
        runs = self._client.search_runs(
            experiment_ids=[experiment.experiment_id],
            filter_string="tags.stage = 'exploration'",
        )
        return [r.to_dictionary() for r in runs]

    @staticmethod
    def serve_ui(host: str = "127.0.0.1", port: int = 5000, tracking_uri: str = "./mlruns"):
        """Run ``python -m mlflow ui`` against ``tracking_uri``.

        ``subprocess.run`` waits for the child process, so this call does not
        return until the UI server exits.
        """
        import subprocess
        import sys
        subprocess.run([
            sys.executable, "-m", "mlflow", "ui",
            "--backend-store-uri", tracking_uri, "--host", host, "--port", str(port)
        ])

    def _find_experiment(self):
        """Look the experiment up through this tracker's own client.

        Using the same client for the lookup and for ``search_runs`` keeps both
        calls on ``self.tracking_uri`` even if the process-wide tracking URI
        was changed after construction. Returns ``None`` when not found.
        """
        return self._client.get_experiment_by_name(self.experiment_name)

    def _ensure_experiment(self) -> str:
        """Return the experiment id, creating the experiment when it does not exist."""
        experiment = mlflow.get_experiment_by_name(self.experiment_name)
        if experiment is None:
            return mlflow.create_experiment(self.experiment_name)
        return experiment.experiment_id

    def _log_exploration_params(self, findings: "ExplorationFindings") -> None:
        """Log source path, source format and target column (``"none"`` when falsy)."""
        mlflow.log_params({
            "source_path": findings.source_path,
            "source_format": findings.source_format,
            "target_column": findings.target_column or "none",
        })

    def _log_exploration_metrics(self, findings: "ExplorationFindings") -> None:
        """Log dataset-level numbers taken from ``findings``."""
        mlflow.log_metrics({
            "row_count": findings.row_count,
            "column_count": findings.column_count,
            "memory_usage_mb": findings.memory_usage_mb,
            "overall_quality_score": findings.overall_quality_score,
            "modeling_ready": 1.0 if findings.modeling_ready else 0.0,
            "critical_issues_count": len(findings.critical_issues),
            "warnings_count": len(findings.warnings),
        })

    def _log_column_metrics(self, findings: "ExplorationFindings") -> None:
        """Log one ``columns_<type>`` count per inferred type, plus ``columns_needing_cleaning``."""
        from collections import Counter

        columns = list(findings.columns.values())
        type_counts = Counter(col.inferred_type.value for col in columns)
        cleaning_needed_count = sum(1 for col in columns if col.cleaning_needed)
        for type_name, count in type_counts.items():
            mlflow.log_metric(f"columns_{type_name}", count)
        mlflow.log_metric("columns_needing_cleaning", cleaning_needed_count)

    def _set_exploration_tags(self, findings: "ExplorationFindings") -> None:
        """Tag the active run as an exploration run, with readiness and time-series flags."""
        mlflow.set_tags({
            "stage": "exploration",
            "modeling_ready": str(findings.modeling_ready),
            "is_time_series": str(findings.is_time_series),
        })

    def _log_pipeline_params(self, pipeline: "RecommendationPipeline") -> None:
        """Log the number of recommendations and the pipeline's fitted flag."""
        mlflow.log_params({
            "recommendation_count": len(pipeline.recommendations),
            "is_fitted": pipeline._is_fitted,
        })

    def _log_pipeline_metrics(self, pipeline: "RecommendationPipeline") -> None:
        """Log recommendation counts grouped by ``recommendation_type`` and by ``category``."""
        from collections import Counter

        rec_types = Counter(rec.recommendation_type for rec in pipeline.recommendations)
        rec_categories = Counter(rec.category for rec in pipeline.recommendations)
        for rec_type, count in rec_types.items():
            mlflow.log_metric(f"rec_type_{rec_type}", count)
        for category, count in rec_categories.items():
            mlflow.log_metric(f"rec_category_{category}", count)

    def _log_pipeline_artifacts(self, pipeline: "RecommendationPipeline") -> None:
        """Log the pipeline config dict and its generated code (default platform and Databricks)."""
        from customer_retention.analysis.recommendations.base import Platform
        mlflow.log_dict(pipeline.to_dict(), "pipeline_config.json")
        mlflow.log_text(pipeline.generate_code(), "generated_code_local.py")
        mlflow.log_text(pipeline.generate_code(Platform.DATABRICKS), "generated_code_databricks.py")
@@ -0,0 +1,50 @@
1
+ from typing import Any, Dict, Optional
2
+
3
+ from .base import MLflowAdapter
4
+
5
+ try:
6
+ import mlflow
7
+ from mlflow.tracking import MlflowClient
8
+ MLFLOW_AVAILABLE = True
9
+ except ImportError:
10
+ MLFLOW_AVAILABLE = False
11
+
12
+
13
class LocalMLflow(MLflowAdapter):
    """``MLflowAdapter`` that talks to MLflow through the ``mlflow`` module API."""

    def __init__(self, tracking_uri: str = "./mlruns"):
        """Point MLflow at ``tracking_uri`` and build a client for it.

        Raises:
            ImportError: if the ``mlflow`` package could not be imported.
        """
        if not MLFLOW_AVAILABLE:
            raise ImportError("mlflow package required: pip install mlflow")
        # set_tracking_uri changes module-level MLflow state, not just this object.
        mlflow.set_tracking_uri(tracking_uri)
        self._client = MlflowClient(tracking_uri=tracking_uri)
        self.tracking_uri = tracking_uri
        self._run_id = None

    def start_run(self, experiment_name: str, run_name: Optional[str] = None) -> str:
        """Start a run in ``experiment_name``, creating the experiment if missing.

        Returns:
            The id of the run that was started.
        """
        existing = mlflow.get_experiment_by_name(experiment_name)
        if existing is not None:
            target_id = existing.experiment_id
        else:
            target_id = mlflow.create_experiment(experiment_name)
        active = mlflow.start_run(experiment_id=target_id, run_name=run_name)
        self._run_id = active.info.run_id
        return self._run_id

    def end_run(self) -> None:
        """End the active MLflow run and forget the stored run id."""
        mlflow.end_run()
        self._run_id = None

    def log_params(self, params: Dict[str, Any]) -> None:
        """Forward ``params`` to ``mlflow.log_params``."""
        mlflow.log_params(params)

    def log_metrics(self, metrics: Dict[str, float]) -> None:
        """Forward ``metrics`` to ``mlflow.log_metrics``."""
        mlflow.log_metrics(metrics)

    def log_model(self, model: Any, artifact_path: str, registered_name: Optional[str] = None) -> str:
        """Log ``model`` via ``mlflow.sklearn.log_model`` and return its model URI."""
        logged = mlflow.sklearn.log_model(model, artifact_path, registered_model_name=registered_name)
        return logged.model_uri

    def load_model(self, model_uri: str) -> Any:
        """Load a model from ``model_uri`` via ``mlflow.sklearn.load_model``."""
        return mlflow.sklearn.load_model(model_uri)

    def transition_stage(self, model_name: str, version: str, stage: str) -> None:
        """Move a registered model version to ``stage`` using the tracking client."""
        self._client.transition_model_version_stage(name=model_name, version=version, stage=stage)
@@ -0,0 +1,5 @@
1
+ from .base import DeltaStorage
2
+ from .databricks import DatabricksDelta
3
+ from .local import LocalDelta
4
+
5
+ __all__ = ["DeltaStorage", "LocalDelta", "DatabricksDelta"]
@@ -0,0 +1,33 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ import pandas as pd
5
+
6
+
7
class DeltaStorage(ABC):
    """Abstract interface for Delta table storage backends.

    Every method is abstract; concrete backends must implement all of them.
    """

    @abstractmethod
    def read(self, path: str, version: Optional[int] = None) -> pd.DataFrame:
        """Return the table at ``path`` as a DataFrame, optionally at ``version``."""

    @abstractmethod
    def write(
        self,
        df: pd.DataFrame,
        path: str,
        mode: str = "overwrite",
        partition_by: Optional[List[str]] = None,
        metadata: Optional[Dict[str, str]] = None,
    ) -> None:
        """Write ``df`` to ``path`` using ``mode``, with optional partitioning and metadata."""

    @abstractmethod
    def merge(
        self,
        df: pd.DataFrame,
        path: str,
        condition: str,
        update_cols: Optional[List[str]] = None,
    ) -> None:
        """Merge ``df`` into the table at ``path`` on ``condition``."""

    @abstractmethod
    def history(self, path: str) -> List[Dict[str, Any]]:
        """Return the history entries of the table at ``path``."""

    @abstractmethod
    def vacuum(self, path: str, retention_hours: int = 168) -> None:
        """Vacuum the table at ``path`` using ``retention_hours``."""

    @abstractmethod
    def exists(self, path: str) -> bool:
        """Return whether a table exists at ``path``."""
@@ -0,0 +1,76 @@
1
+ import json
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ import pandas as pd
5
+
6
+ from customer_retention.core.compat.detection import get_spark_session, is_spark_available
7
+
8
+ from .base import DeltaStorage
9
+
10
+
11
class DatabricksDelta(DeltaStorage):
    """``DeltaStorage`` backend that reads and writes Delta tables through Spark."""

    def __init__(self):
        """Check that Spark is available; the session itself is resolved lazily.

        Raises:
            ImportError: if ``is_spark_available()`` reports no PySpark.
        """
        if not is_spark_available():
            raise ImportError("PySpark required for DatabricksDelta")
        self._spark = None

    @property
    def spark(self) -> Any:
        """Return the cached Spark session, resolving it on first access.

        Uses ``get_spark_session()`` first and falls back to
        ``SparkSession.builder.getOrCreate()`` when that returns ``None``.
        """
        if self._spark is None:
            self._spark = get_spark_session()
            if self._spark is None:
                from pyspark.sql import SparkSession
                self._spark = SparkSession.builder.getOrCreate()
        return self._spark

    def read(self, path: str, version: Optional[int] = None) -> pd.DataFrame:
        """Load the Delta table at ``path`` into pandas, optionally at ``version``."""
        reader = self.spark.read.format("delta")
        if version is not None:
            reader = reader.option("versionAsOf", version)
        return reader.load(path).toPandas()

    def write(self, df: pd.DataFrame, path: str, mode: str = "overwrite",
              partition_by: Optional[List[str]] = None,
              metadata: Optional[Dict[str, str]] = None) -> None:
        """Write ``df`` to the Delta table at ``path``.

        Args:
            df: Data to write; converted to a Spark DataFrame.
            path: Target table location.
            mode: Spark save mode passed to the writer.
            partition_by: Optional partition columns.
            metadata: Optional commit metadata, stored JSON-encoded as the
                commit's user metadata.
        """
        spark_df = self.spark.createDataFrame(df)
        writer = spark_df.write.format("delta").mode(mode)
        if metadata:
            # Attach the metadata to this write only. Setting the session conf
            # "spark.databricks.delta.commitInfo.userMetadata" would persist and
            # stamp every later commit in the session (including merges and
            # writes that pass no metadata) with stale values.
            writer = writer.option("userMetadata", json.dumps(metadata))
        if partition_by:
            writer = writer.partitionBy(*partition_by)
        writer.save(path)

    def merge(self, df: pd.DataFrame, path: str, condition: str,
              update_cols: Optional[List[str]] = None) -> None:
        """Upsert ``df`` into the table at ``path``.

        Matched rows update only ``update_cols`` when given, otherwise all
        columns; unmatched source rows are inserted. ``condition`` is written
        against the ``target`` and ``source`` aliases.
        """
        from delta.tables import DeltaTable
        spark_df = self.spark.createDataFrame(df)
        target = DeltaTable.forPath(self.spark, path)
        merge_builder = target.alias("target").merge(spark_df.alias("source"), condition)
        if update_cols:
            update_dict = {col: f"source.{col}" for col in update_cols}
            merge_builder = merge_builder.whenMatchedUpdate(set=update_dict)
        else:
            merge_builder = merge_builder.whenMatchedUpdateAll()
        merge_builder.whenNotMatchedInsertAll().execute()

    def history(self, path: str) -> List[Dict[str, Any]]:
        """Return the table's history as a list of record dicts."""
        from delta.tables import DeltaTable
        dt = DeltaTable.forPath(self.spark, path)
        history_df = dt.history()
        return history_df.toPandas().to_dict("records")

    def vacuum(self, path: str, retention_hours: int = 168) -> None:
        """Vacuum the table at ``path``, passing ``retention_hours`` through."""
        from delta.tables import DeltaTable
        dt = DeltaTable.forPath(self.spark, path)
        dt.vacuum(retention_hours)

    def exists(self, path: str) -> bool:
        """Return True if ``DeltaTable.forPath`` can open ``path``, else False."""
        from delta.tables import DeltaTable
        try:
            DeltaTable.forPath(self.spark, path)
            return True
        except Exception:
            # Best-effort probe: any failure to open the table is reported as
            # "does not exist" rather than propagated.
            return False
@@ -0,0 +1,59 @@
1
+ from typing import Any, Dict, List, Optional
2
+
3
+ import pandas as pd
4
+
5
+ from .base import DeltaStorage
6
+
7
+ try:
8
+ import deltalake
9
+ from deltalake import DeltaTable, write_deltalake
10
+ DELTA_RS_AVAILABLE = True
11
+ except ImportError:
12
+ DELTA_RS_AVAILABLE = False
13
+
14
+
15
class LocalDelta(DeltaStorage):
    """``DeltaStorage`` backend built on the ``deltalake`` package."""

    def __init__(self):
        """Fail fast when ``deltalake`` could not be imported.

        Raises:
            ImportError: if the ``deltalake`` package is missing.
        """
        if not DELTA_RS_AVAILABLE:
            raise ImportError("deltalake package required: pip install deltalake")

    def read(self, path: str, version: Optional[int] = None) -> pd.DataFrame:
        """Load the table at ``path`` into pandas, optionally at ``version``."""
        table = DeltaTable(path) if version is None else DeltaTable(path, version=version)
        return table.to_pandas()

    def write(
        self,
        df: pd.DataFrame,
        path: str,
        mode: str = "overwrite",
        partition_by: Optional[List[str]] = None,
        metadata: Optional[Dict[str, str]] = None,
    ) -> None:
        """Write ``df`` to ``path`` via ``write_deltalake``.

        ``partition_by`` and ``metadata`` are forwarded only when truthy;
        metadata is wrapped in ``CommitProperties(custom_metadata=...)``.
        """
        options: Dict[str, Any] = {"mode": mode}
        if partition_by:
            options["partition_by"] = partition_by
        if metadata:
            from deltalake import CommitProperties
            options["commit_properties"] = CommitProperties(custom_metadata=metadata)
        write_deltalake(path, df, **options)

    def merge(
        self,
        df: pd.DataFrame,
        path: str,
        condition: str,
        update_cols: Optional[List[str]] = None,
    ) -> None:
        """Upsert ``df`` into the table at ``path``.

        Matched rows update only ``update_cols`` when given, otherwise all
        columns; unmatched source rows are inserted. ``condition`` is written
        against the ``source`` and ``target`` aliases.
        """
        builder = DeltaTable(path).merge(
            df, predicate=condition, source_alias="source", target_alias="target"
        )
        if not update_cols:
            builder = builder.when_matched_update_all()
        else:
            assignments = {name: f"source.{name}" for name in update_cols}
            builder = builder.when_matched_update(updates=assignments)
        builder.when_not_matched_insert_all().execute()

    def history(self, path: str) -> List[Dict[str, Any]]:
        """Return ``DeltaTable(path).history()``."""
        return DeltaTable(path).history()

    def vacuum(self, path: str, retention_hours: int = 168) -> None:
        """Vacuum the table at ``path``.

        Runs for real (``dry_run=False``) and with
        ``enforce_retention_duration=False``.
        """
        DeltaTable(path).vacuum(
            retention_hours=retention_hours,
            enforce_retention_duration=False,
            dry_run=False,
        )

    def exists(self, path: str) -> bool:
        """Return True when ``path`` contains a ``_delta_log`` directory."""
        from pathlib import Path
        log_dir = Path(path) / "_delta_log"
        return log_dir.is_dir()
@@ -0,0 +1,47 @@
1
+ """Feature store module for leakage-safe feature management.
2
+
3
+ This module provides a unified interface for managing ML features with
4
+ point-in-time correctness. It supports both local development (Feast)
5
+ and production (Databricks Feature Engineering) backends.
6
+
7
+ Key Components:
8
+ - TemporalFeatureDefinition: Feature definition with temporal metadata
9
+ - FeatureRegistry: Central registry for all feature definitions
10
+ - FeatureStoreManager: Unified interface for feature store operations
11
+
12
+ Example:
13
+ >>> from customer_retention.integrations.feature_store import (
14
+ ... FeatureStoreManager, TemporalFeatureDefinition, FeatureRegistry
15
+ ... )
16
+ >>>
17
+ >>> # Create feature definitions
18
+ >>> registry = FeatureRegistry()
19
+ >>> registry.register(TemporalFeatureDefinition(
20
+ ... name="tenure_months",
21
+ ... description="Customer tenure in months",
22
+ ... entity_key="customer_id",
23
+ ... timestamp_column="feature_timestamp",
24
+ ... source_columns=["tenure"],
25
+ ... ))
26
+ >>>
27
+ >>> # Create feature store manager
28
+ >>> manager = FeatureStoreManager.create(backend="feast")
29
+ >>> manager.publish_features(df, registry)
30
+ """
31
+
32
+ from .definitions import (
33
+ FeatureComputationType,
34
+ TemporalAggregation,
35
+ TemporalFeatureDefinition,
36
+ )
37
+ from .manager import FeatureStoreManager, get_feature_store_manager
38
+ from .registry import FeatureRegistry
39
+
40
+ __all__ = [
41
+ "TemporalFeatureDefinition",
42
+ "FeatureComputationType",
43
+ "TemporalAggregation",
44
+ "FeatureRegistry",
45
+ "FeatureStoreManager",
46
+ "get_feature_store_manager",
47
+ ]