churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,117 @@
1
+ from enum import Enum
2
+ from typing import Any, Optional
3
+
4
+ from pydantic import BaseModel
5
+
6
+ from .source_config import DataSourceConfig
7
+
8
+
9
class TimestampStrategy(str, Enum):
    """Identifier for how timestamps are assigned when building temporal datasets."""

    PRODUCTION = "production"
    SYNTHETIC_RANDOM = "synthetic_random"
    SYNTHETIC_INDEX = "synthetic_index"
    SYNTHETIC_FIXED = "synthetic_fixed"
    DERIVED = "derived"
15
+
16
+
17
class DedupStrategy(str, Enum):
    """Which duplicate row to retain during deduplication."""

    KEEP_FIRST = "keep_first"
    KEEP_LAST = "keep_last"
    KEEP_MOST_COMPLETE = "keep_most_complete"
21
+
22
+
23
class BronzeConfig(BaseModel):
    """Settings for the bronze (raw ingest) layer cleanup pass."""

    deduplicate: bool = True
    dedup_strategy: DedupStrategy = DedupStrategy.KEEP_LAST
    dedup_keys: list[str] = ["custid"]
    # NOTE(review): the two thresholds below presumably gate column pruning
    # (mostly-missing or near-constant columns) — confirm against consumers.
    max_missing_pct: float = 0.95
    min_distinct_values: int = 2
29
+
30
+
31
class SilverConfig(BaseModel):
    """Settings for the silver (cleaned/conformed) layer."""

    entity_key: str = "custid"
    reference_date_column: Optional[str] = None
    auto_detect_encoding: bool = True
    auto_detect_scaling: bool = True
36
+
37
+
38
class GoldConfig(BaseModel):
    """Feature-store destination settings for the gold layer."""

    feature_store_catalog: str = "main"
    feature_store_schema: str = "feature_store"
    feature_table_name: str = "customer_features"
    version: str = "v1"

    def get_full_feature_table_name(self) -> str:
        """Return the dotted catalog.schema.table name of the feature table."""
        parts = (self.feature_store_catalog, self.feature_store_schema, self.feature_table_name)
        return ".".join(parts)
46
+
47
+
48
class ModelingConfig(BaseModel):
    """Training/evaluation settings, including asymmetric misclassification costs."""

    target_column: str = "retained"
    positive_class: int = 1
    test_size: float = 0.2
    stratify: bool = True
    primary_metric: str = "average_precision"
    # Relative misclassification costs — presumably consumed by threshold/ROI
    # logic elsewhere in the package; confirm against callers.
    cost_false_negative: float = 100.0
    cost_false_positive: float = 10.0

    def get_cost_ratio(self) -> float:
        """Return the ratio of false-negative cost to false-positive cost.

        Raises:
            ValueError: if ``cost_false_positive`` is zero (ratio undefined).
        """
        # Guard: previously a zero cost surfaced as an opaque ZeroDivisionError.
        if self.cost_false_positive == 0:
            raise ValueError("cost_false_positive must be non-zero to compute a cost ratio")
        return self.cost_false_negative / self.cost_false_positive
59
+
60
+
61
class ValidationConfig(BaseModel):
    """Gate behaviour for validation findings."""

    fail_on_critical: bool = True
    fail_on_high: bool = False
    leakage_correlation_threshold: float = 0.90
    max_overfit_gap: float = 0.15
66
+
67
+
68
class TemporalConfig(BaseModel):
    """Point-in-time settings for temporal feature/label handling."""

    timestamp_strategy: TimestampStrategy = TimestampStrategy.PRODUCTION
    feature_timestamp_column: Optional[str] = None
    label_timestamp_column: Optional[str] = None
    observation_window_days: int = 90
    # Presumably consumed by the synthetic timestamp strategies — confirm.
    synthetic_base_date: str = "2024-01-01"
    synthetic_range_days: int = 365
    snapshot_prefix: str = "ml_training_snapshot"
    # Leakage guards
    enforce_point_in_time: bool = True
    max_feature_target_correlation: float = 0.90
    block_future_features: bool = True
    # Optional label derivation from a feature column
    derive_label_from_feature: bool = False
    derivation_config: Optional[dict[str, Any]] = None
81
+
82
+
83
class PathConfig(BaseModel):
    """Optional storage locations for each medallion layer."""

    landing_zone: Optional[str] = None
    bronze: Optional[str] = None
    silver: Optional[str] = None
    gold: Optional[str] = None
88
+
89
+
90
class PipelineConfig(BaseModel):
    """Top-level configuration aggregating all pipeline layer settings."""

    project_name: str
    project_description: Optional[str] = None
    version: str = "1.0.0"

    data_sources: list[DataSourceConfig] = []
    bronze: BronzeConfig = BronzeConfig()
    silver: SilverConfig = SilverConfig()
    gold: GoldConfig = GoldConfig()
    modeling: ModelingConfig = ModelingConfig()
    validation: ValidationConfig = ValidationConfig()
    temporal: TemporalConfig = TemporalConfig()
    paths: PathConfig = PathConfig()

    def get_source_by_name(self, name: str) -> Optional[DataSourceConfig]:
        """Return the configured source called *name*, or None if absent."""
        for source in self.data_sources:
            if source.name == name:
                return source
        return None

    def get_target_source(self) -> Optional[DataSourceConfig]:
        """Return the first source declaring a target-typed column, if any."""
        candidates = (
            source for source in self.data_sources
            if any(c.column_type.value == "target" for c in source.columns)
        )
        return next(candidates, None)

    def get_all_feature_columns(self) -> list[str]:
        """Collect feature column names across every configured source."""
        return [c.name for source in self.data_sources for c in source.get_feature_columns()]
@@ -0,0 +1,83 @@
1
+ from enum import Enum
2
+ from typing import Optional
3
+
4
+ from pydantic import BaseModel, model_validator
5
+
6
+ from .column_config import ColumnConfig
7
+
8
+
9
class SourceType(str, Enum):
    """How a data source is materialized: file, table, or stream."""

    BATCH_FILE = "batch_file"
    BATCH_TABLE = "batch_table"
    STREAM = "stream"
13
+
14
+
15
class FileFormat(str, Enum):
    """Supported on-disk formats for batch file sources."""

    CSV = "csv"
    PARQUET = "parquet"
    DELTA = "delta"
    JSON = "json"
    ORC = "orc"
    AVRO = "avro"
22
+
23
+
24
class Grain(str, Enum):
    """Row granularity of a source."""

    CUSTOMER = "customer"
    TRANSACTION = "transaction"
    EVENT = "event"
28
+
29
+
30
class DataSourceConfig(BaseModel):
    """Describes a single input data source (file, table, or stream)."""

    name: str
    source_type: SourceType
    primary_key: str

    # Batch-file settings
    path: Optional[str] = None
    file_format: Optional[FileFormat] = None

    # Batch-table settings (catalog.schema.table)
    catalog: Optional[str] = None
    # Named schema_name rather than schema — presumably to avoid clashing
    # with pydantic's BaseModel.schema; confirm before renaming.
    schema_name: Optional[str] = None
    table: Optional[str] = None

    # CSV parsing options
    delimiter: str = ","
    header: bool = True
    quote_char: str = '"'
    encoding: str = "utf-8"

    columns: list[ColumnConfig] = []
    timestamp_column: Optional[str] = None
    customer_key: Optional[str] = None
    grain: Optional[Grain] = None

    # Optional data-quality expectations
    expected_row_count_min: Optional[int] = None
    expected_row_count_max: Optional[int] = None
    expected_columns: Optional[list[str]] = None
    freshness_sla_hours: Optional[int] = None

    @model_validator(mode='after')
    def validate_source_requirements(self):
        """Ensure the fields required by the chosen source_type are present."""
        if self.source_type == SourceType.BATCH_FILE:
            if not self.path:
                raise ValueError("path required for batch_file source_type")
            if not self.file_format:
                raise ValueError("file_format required for batch_file source_type")
        if self.source_type == SourceType.BATCH_TABLE and not self.table:
            raise ValueError("table required for batch_table source_type")
        return self

    def get_full_table_name(self) -> str:
        """Return the dotted catalog.schema.table identifier for table sources.

        Raises:
            ValueError: if this source is not a batch table.
        """
        if self.source_type != SourceType.BATCH_TABLE:
            # Fixed: message previously referenced a nonexistent "full_table_name" name.
            raise ValueError("get_full_table_name only applicable for batch_table")
        parts = [p for p in [self.catalog, self.schema_name, self.table] if p]
        return ".".join(parts)

    def get_column_by_name(self, name: str) -> Optional[ColumnConfig]:
        """Return the column config named *name*, or None if absent."""
        return next((c for c in self.columns if c.name == name), None)

    def get_feature_columns(self) -> list[ColumnConfig]:
        """Return the columns flagged as usable model features."""
        return [c for c in self.columns if c.should_be_used_as_feature()]

    def is_cloud_path(self) -> bool:
        """True when *path* points at a known cloud object-store scheme."""
        if not self.path:
            return False
        return any(self.path.startswith(prefix) for prefix in ["s3://", "abfss://", "gs://", "wasb://", "adl://"])
@@ -0,0 +1,28 @@
1
+ from ..components.enums import Severity
2
+ from .leakage import (
3
+ DEFAULT_THRESHOLDS,
4
+ TEMPORAL_METADATA_COLUMNS,
5
+ LeakageThresholds,
6
+ calculate_class_overlap,
7
+ classify_correlation,
8
+ classify_separation,
9
+ get_valid_feature_columns,
10
+ )
11
+ from .severity import ThresholdConfig, classify_by_thresholds, severity_recommendation
12
+ from .statistics import (
13
+ compute_chi_square,
14
+ compute_effect_size,
15
+ compute_ks_statistic,
16
+ compute_psi_categorical,
17
+ compute_psi_from_series,
18
+ compute_psi_numeric,
19
+ )
20
+
21
+ __all__ = [
22
+ "Severity",
23
+ "compute_psi_numeric", "compute_psi_categorical", "compute_psi_from_series", "compute_ks_statistic", "compute_chi_square",
24
+ "compute_effect_size",
25
+ "LeakageThresholds", "classify_correlation", "classify_separation", "calculate_class_overlap", "DEFAULT_THRESHOLDS",
26
+ "ThresholdConfig", "classify_by_thresholds", "severity_recommendation",
27
+ "TEMPORAL_METADATA_COLUMNS", "get_valid_feature_columns",
28
+ ]
@@ -0,0 +1,85 @@
1
+ from dataclasses import dataclass
2
+ from typing import FrozenSet, List, Optional, Set, Tuple
3
+
4
+ from customer_retention.core.compat import DataFrame, Series
5
+
6
+ from ..components.enums import Severity
7
+
8
# Columns that carry point-in-time bookkeeping rather than predictive signal;
# they are unconditionally excluded when selecting feature columns.
TEMPORAL_METADATA_COLUMNS: FrozenSet[str] = frozenset((
    "feature_timestamp",
    "label_timestamp",
    "label_available_flag",
    "event_timestamp",
))
14
+
15
+
16
def _build_exclusion_set(entity_column: Optional[str], target_column: Optional[str], additional_exclude: Optional[Set[str]]) -> Set[str]:
    """Union of temporal metadata columns, key/target columns, and caller extras."""
    excluded: Set[str] = set(TEMPORAL_METADATA_COLUMNS)
    excluded.update(col for col in (entity_column, target_column) if col)
    excluded |= additional_exclude or set()
    return excluded
25
+
26
+
27
def get_valid_feature_columns(
    df: DataFrame,
    entity_column: Optional[str] = None,
    target_column: Optional[str] = None,
    additional_exclude: Optional[Set[str]] = None,
) -> List[str]:
    """Filter DataFrame columns to those valid as model features.

    Drops temporal metadata columns, the entity/target columns, any
    caller-supplied exclusions, and columns prefixed with "original_".
    """
    blocked = _build_exclusion_set(entity_column, target_column, additional_exclude)
    return [
        col for col in df.columns
        if col not in blocked and not col.startswith("original_")
    ]
37
+
38
+
39
@dataclass
class LeakageThresholds:
    """Cutoffs used to grade potential target-leakage signals."""

    # Absolute feature/target correlation cutoffs (see classify_correlation)
    correlation_critical: float = 0.90
    correlation_high: float = 0.70
    correlation_medium: float = 0.50
    # Class-overlap percentage cutoffs; lower overlap = stronger separation
    separation_critical: float = 0.0
    separation_high: float = 1.0
    separation_medium: float = 5.0
    # Single-feature AUC cutoffs — consumers not visible in this module
    auc_critical: float = 0.90
    auc_high: float = 0.80


# Shared default instance used by the classifier functions in this module.
DEFAULT_THRESHOLDS = LeakageThresholds()
52
+
53
+
54
def classify_correlation(corr: float, thresholds: LeakageThresholds = DEFAULT_THRESHOLDS) -> Tuple[Severity, str]:
    """Grade the absolute feature/target correlation against leakage thresholds."""
    magnitude = abs(corr)
    ladder = (
        (thresholds.correlation_critical, Severity.CRITICAL, "high_correlation"),
        (thresholds.correlation_high, Severity.HIGH, "suspicious_correlation"),
        (thresholds.correlation_medium, Severity.MEDIUM, "elevated_correlation"),
    )
    for cutoff, severity, label in ladder:
        if magnitude >= cutoff:
            return severity, label
    return Severity.INFO, "normal"
63
+
64
+
65
def classify_separation(overlap_pct: float, thresholds: LeakageThresholds = DEFAULT_THRESHOLDS) -> Tuple[Severity, str]:
    """Grade class-overlap percentage; lower overlap is more suspicious."""
    # Zero (or below-threshold) overlap means the classes never share values.
    if overlap_pct <= thresholds.separation_critical:
        return Severity.CRITICAL, "perfect_separation"
    for cutoff, severity, label in (
        (thresholds.separation_high, Severity.HIGH, "near_perfect_separation"),
        (thresholds.separation_medium, Severity.MEDIUM, "high_separation"),
    ):
        if overlap_pct < cutoff:
            return severity, label
    return Severity.INFO, "normal"
73
+
74
+
75
def calculate_class_overlap(feature: Series, target: Series) -> float:
    """Percentage of the combined value range shared by both target classes.

    Returns 100.0 for degenerate cases (an empty class after dropping NaNs,
    or a zero-width combined range), treating them as fully overlapping.
    """
    negatives = feature[target == 0].dropna()
    positives = feature[target == 1].dropna()
    if len(negatives) == 0 or len(positives) == 0:
        return 100.0
    span = max(negatives.max(), positives.max()) - min(negatives.min(), positives.min())
    if span == 0:
        return 100.0
    shared = min(negatives.max(), positives.max()) - max(negatives.min(), positives.min())
    if shared < 0:
        shared = 0
    return (shared / span) * 100
@@ -0,0 +1,53 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+ from ..components.enums import Severity
5
+
6
+
7
@dataclass
class ThresholdConfig:
    """Severity cut-offs consumed by classify_by_thresholds.

    Any level left as None is skipped.  With ``ascending=True`` larger values
    are worse (value >= cutoff triggers the level); with ``ascending=False``
    the comparison is inverted (value <= cutoff triggers).
    """

    critical: Optional[float] = None
    high: Optional[float] = None
    warning: Optional[float] = None
    medium: Optional[float] = None
    low: Optional[float] = None
    ascending: bool = True
15
+
16
+
17
def classify_by_thresholds(value: float, config: ThresholdConfig) -> Severity:
    """Map a metric value onto the most severe matching threshold level.

    Levels are checked from CRITICAL down to LOW; unset (None) levels are
    skipped, and INFO is returned when nothing triggers.
    """
    # One comparator covers both directions instead of two mirrored branches.
    trips = (lambda v, cut: v >= cut) if config.ascending else (lambda v, cut: v <= cut)
    ladder = (
        (config.critical, Severity.CRITICAL),
        (config.high, Severity.HIGH),
        (config.warning, Severity.WARNING),
        (config.medium, Severity.MEDIUM),
        (config.low, Severity.LOW),
    )
    for cutoff, severity in ladder:
        if cutoff is not None and trips(value, cutoff):
            return severity
    return Severity.INFO
41
+
42
+
43
def severity_recommendation(severity: Severity, context: str, action_critical: str = "investigate immediately",
                            action_warning: str = "monitor closely", action_info: str = "no action needed") -> str:
    """Build a one-line '<LEVEL>: <context>. <action>.' recommendation string.

    CRITICAL/HIGH share the critical action, WARNING/MEDIUM the warning
    action, LOW/INFO the info action; unknown severities fall back to INFO
    with no action appended.
    """
    templates = {
        Severity.CRITICAL: ("CRITICAL", action_critical),
        Severity.HIGH: ("HIGH", action_critical),
        Severity.WARNING: ("WARNING", action_warning),
        Severity.MEDIUM: ("MEDIUM", action_warning),
        Severity.LOW: ("LOW", action_info),
        Severity.INFO: ("INFO", action_info),
    }
    if severity not in templates:
        return f"INFO: {context}."
    label, action = templates[severity]
    return f"{label}: {context}. {action}."
@@ -0,0 +1,90 @@
1
+ from typing import Dict, List, Tuple, Union
2
+
3
+ import numpy as np
4
+ from scipy import stats
5
+
6
+ from customer_retention.core.compat import Series
7
+
8
+
9
+ def _ensure_array(obj: Union[np.ndarray, List[float]]) -> np.ndarray:
10
+ return obj if isinstance(obj, np.ndarray) else np.array(obj)
11
+
12
+
13
+ def compute_effect_size(group1: Union[np.ndarray, List[float]], group2: Union[np.ndarray, List[float]]) -> Tuple[float, str]:
14
+ arr1 = _ensure_array(group1)
15
+ arr2 = _ensure_array(group2)
16
+ if len(arr1) < 2 or len(arr2) < 2:
17
+ return 0.0, "Negligible"
18
+ pooled_std = np.sqrt((np.var(arr1) + np.var(arr2)) / 2)
19
+ if pooled_std == 0:
20
+ return 0.0, "Negligible"
21
+ d = float((np.mean(arr1) - np.mean(arr2)) / pooled_std)
22
+ abs_d = abs(d)
23
+ if abs_d >= 0.8:
24
+ return d, "Large effect"
25
+ if abs_d >= 0.5:
26
+ return d, "Medium effect"
27
+ if abs_d >= 0.2:
28
+ return d, "Small effect"
29
+ return d, "Negligible"
30
+
31
+
32
def compute_psi_numeric(current: Series, reference_hist_edges: List[float], reference_hist_counts: List[int], epsilon: float = 1e-10) -> float:
    """Population Stability Index of *current* against a stored reference histogram.

    The reference distribution arrives as bin edges plus per-bin counts; both
    proportion vectors are floored at *epsilon* so the log ratio stays finite.
    """
    bin_edges = np.array(reference_hist_edges)
    ref_counts = np.array(reference_hist_counts)
    cur_counts, _ = np.histogram(current.dropna(), bins=bin_edges)
    ref_share = ref_counts / ref_counts.sum()
    if cur_counts.sum() > 0:
        cur_share = cur_counts / cur_counts.sum()
    else:
        # No current observations at all: treat every bin as empty.
        cur_share = np.zeros_like(cur_counts, dtype=float)
    ref_share = np.maximum(ref_share, epsilon)
    cur_share = np.maximum(cur_share, epsilon)
    return float(np.sum((cur_share - ref_share) * np.log(cur_share / ref_share)))
41
+
42
+
43
+ def _is_categorical_dtype(dtype) -> bool:
44
+ return dtype in ['object', 'category', 'bool']
45
+
46
+
47
+ def compute_psi_from_series(reference: Series, current: Series, n_bins: int = 10, epsilon: float = 1e-10) -> float:
48
+ ref_clean, curr_clean = reference.dropna(), current.dropna()
49
+ if _is_categorical_dtype(ref_clean.dtype) or _is_categorical_dtype(curr_clean.dtype):
50
+ return compute_psi_categorical(ref_clean, curr_clean, epsilon)
51
+ min_val = min(ref_clean.min(), curr_clean.min())
52
+ max_val = max(ref_clean.max(), curr_clean.max())
53
+ bins = np.linspace(min_val, max_val, n_bins + 1)
54
+ ref_hist, _ = np.histogram(ref_clean, bins=bins)
55
+ curr_hist, _ = np.histogram(curr_clean, bins=bins)
56
+ ref_pct = ref_hist / len(ref_clean) + epsilon
57
+ curr_pct = curr_hist / len(curr_clean) + epsilon if len(curr_clean) > 0 else np.full_like(ref_hist, epsilon, dtype=float)
58
+ return float(np.sum((curr_pct - ref_pct) * np.log(curr_pct / ref_pct)))
59
+
60
+
61
+ def compute_psi_categorical(reference: Series, current: Series, epsilon: float = 1e-10) -> float:
62
+ ref_counts = reference.value_counts(normalize=True)
63
+ curr_counts = current.value_counts(normalize=True)
64
+ all_categories = set(ref_counts.index) | set(curr_counts.index)
65
+ psi = 0.0
66
+ for cat in all_categories:
67
+ ref_pct = ref_counts.get(cat, epsilon)
68
+ curr_pct = curr_counts.get(cat, epsilon)
69
+ psi += (curr_pct - ref_pct) * np.log((curr_pct + epsilon) / (ref_pct + epsilon))
70
+ return float(psi)
71
+
72
+
73
def compute_ks_statistic(reference: Series, current: Series) -> Tuple[float, float]:
    """Two-sample Kolmogorov-Smirnov test on the non-null values of each series."""
    result = stats.ks_2samp(reference.dropna(), current.dropna())
    # KstestResult is a named tuple: (statistic, pvalue).
    return float(result[0]), float(result[1])
77
+
78
+
79
def compute_chi_square(current: Series, baseline_proportions: Dict[str, float]) -> Tuple[float, float]:
    """Chi-square goodness-of-fit of *current* counts vs baseline proportions.

    Categories absent from the baseline get a 1e-10 expected floor; the
    expected vector is then rescaled to match the observed total, which
    scipy's chisquare requires.
    """
    observed_counts = current.value_counts()
    categories = sorted(set(list(observed_counts.index) + list(baseline_proportions.keys())))
    n = len(current)
    observed = [observed_counts.get(category, 0) for category in categories]
    expected = [max(baseline_proportions.get(category, 0) * n, 1e-10) for category in categories]
    expected_arr = np.array(expected)
    expected_total = expected_arr.sum()
    if expected_total > 0:
        expected_arr = expected_arr * (sum(observed) / expected_total)
    chi_square, pvalue = stats.chisquare(observed, expected_arr)
    return float(chi_square), float(pvalue)
File without changes
@@ -0,0 +1,167 @@
1
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional

# NOTE: these names are used at runtime — as eagerly-evaluated dataclass field
# annotations (GenerationResult), inside the factory/generation functions, and
# re-exported via __all__ — so they must NOT live under `if TYPE_CHECKING:`
# (that guard made the module raise NameError on import).
from .base import NotebookGenerator, NotebookStage
from .cell_builder import CellBuilder
from .config import FeatureStoreConfig, MLflowConfig, NotebookConfig, OutputFormat, Platform
from .databricks_generator import DatabricksNotebookGenerator
from .local_generator import LocalNotebookGenerator
from .project_init import ProjectInitializer, initialize_project
from .runner import (
    NotebookRunner,
    NotebookValidationResult,
    ScriptRunner,
    ValidationReport,
    validate_generated_notebooks,
)
from .script_generator import DatabricksScriptGenerator, LocalScriptGenerator, ScriptGenerator

if TYPE_CHECKING:
    # Only the heavy analysis package stays import-time optional.
    from customer_retention.analysis.auto_explorer import ExplorationFindings
22
+
23
+
24
@dataclass
class GenerationResult:
    """Saved notebook paths for one platform plus its optional validation report."""

    platform: Platform
    notebook_paths: List[str]
    validation_report: Optional[ValidationReport] = None

    @property
    def all_valid(self) -> bool:
        """True when validation passed — or was never run at all."""
        report = self.validation_report
        if report is None:
            return True
        return report.all_passed
33
+
34
+
35
def generate_orchestration_notebooks(
    findings_path: Optional[str] = None,
    output_dir: str = "./generated_pipelines",
    platforms: Optional[List[Platform]] = None,
    config: Optional[NotebookConfig] = None,
    validate: bool = False,
) -> Dict[Platform, List[str]]:
    """Generate pipeline notebooks for each requested platform.

    Fix: the ``validate`` flag was previously accepted but silently ignored.
    It now runs a dry-run validation pass over each platform's output and
    writes VALIDATION_REPORT.md into that platform directory, while the
    return value stays ``{platform: [saved notebook paths]}``.

    Args:
        findings_path: optional path to saved ExplorationFindings to seed generation.
        output_dir: root directory; one subdirectory is created per platform.
        platforms: platforms to generate for; defaults to LOCAL and DATABRICKS.
        config: notebook configuration; a default NotebookConfig is used if omitted.
        validate: when True, dry-run validate each platform's notebooks and
            persist the report alongside them.
    """
    if platforms is None:
        platforms = [Platform.LOCAL, Platform.DATABRICKS]
    if config is None:
        config = NotebookConfig()

    findings = None
    if findings_path:
        # Deferred import keeps the heavy analysis package optional.
        from customer_retention.analysis.auto_explorer import ExplorationFindings
        findings = ExplorationFindings.load(findings_path)

    results: Dict[Platform, List[str]] = {}
    for platform in platforms:
        generator = create_notebook_generator(platform, findings, config)
        platform_dir = str(Path(output_dir) / platform.value)
        results[platform] = generator.save_all(platform_dir)
        if validate:
            report = NotebookRunner(dry_run=True).validate_sequence(platform_dir, platform.value)
            save_validation_report(platform_dir, report)

    return results
60
+
61
+
62
def generate_and_validate_notebooks(
    findings_path: Optional[str] = None,
    output_dir: str = "./generated_pipelines",
    platforms: Optional[List[Platform]] = None,
    config: Optional[NotebookConfig] = None,
) -> Dict[Platform, GenerationResult]:
    """Generate notebooks per platform, dry-run validate them, and persist a
    VALIDATION_REPORT.md next to each platform's output.

    Returns one GenerationResult per platform, carrying the saved paths and
    the validation report.
    """
    platforms = platforms if platforms is not None else [Platform.LOCAL, Platform.DATABRICKS]
    config = config if config is not None else NotebookConfig()

    findings = None
    if findings_path:
        # Deferred import keeps the heavy analysis package optional.
        from customer_retention.analysis.auto_explorer import ExplorationFindings
        findings = ExplorationFindings.load(findings_path)

    runner = NotebookRunner(dry_run=True)
    results: Dict[Platform, GenerationResult] = {}
    for platform in platforms:
        target_dir = str(Path(output_dir) / platform.value)
        paths = create_notebook_generator(platform, findings, config).save_all(target_dir)
        report = runner.validate_sequence(target_dir, platform.value)
        results[platform] = GenerationResult(platform, paths, report)
        save_validation_report(target_dir, report)

    return results
90
+
91
+
92
def save_validation_report(output_dir: str, report: "ValidationReport") -> str:
    """Write *report* as markdown to <output_dir>/VALIDATION_REPORT.md.

    Fix: the ``ValidationReport`` annotation is stringized — the name is only
    imported under TYPE_CHECKING in this module, so evaluating it eagerly at
    function-definition time raised NameError on import.

    Returns the path of the written file as a string.
    """
    destination = Path(output_dir) / "VALIDATION_REPORT.md"
    destination.write_text(report.to_markdown())
    return str(destination)
96
+
97
+
98
def create_notebook_generator(
    platform: Platform,
    findings: Optional["ExplorationFindings"] = None,
    config: Optional[NotebookConfig] = None,
) -> NotebookGenerator:
    """Instantiate the notebook generator class matching *platform*.

    Raises ValueError for platforms without a registered generator.
    """
    if config is None:
        config = NotebookConfig()

    generator_classes = {
        Platform.LOCAL: LocalNotebookGenerator,
        Platform.DATABRICKS: DatabricksNotebookGenerator,
    }
    if platform not in generator_classes:
        raise ValueError(f"Unsupported platform: {platform}")
    return generator_classes[platform](config, findings)
112
+
113
+
114
def create_script_generator(
    platform: Platform,
    findings: Optional["ExplorationFindings"] = None,
    config: Optional[NotebookConfig] = None,
) -> ScriptGenerator:
    """Instantiate the script generator class matching *platform*.

    Raises ValueError for platforms without a registered generator.
    """
    if config is None:
        config = NotebookConfig()

    generator_classes = {
        Platform.LOCAL: LocalScriptGenerator,
        Platform.DATABRICKS: DatabricksScriptGenerator,
    }
    if platform not in generator_classes:
        raise ValueError(f"Unsupported platform: {platform}")
    return generator_classes[platform](config, findings)
128
+
129
+
130
def generate_orchestration_scripts(
    findings_path: Optional[str] = None,
    output_dir: str = "./generated_pipelines/scripts",
    platforms: Optional[List[Platform]] = None,
    config: Optional[NotebookConfig] = None,
) -> Dict[Platform, List[str]]:
    """Generate pipeline scripts per platform; returns {platform: saved paths}."""
    platforms = [Platform.LOCAL, Platform.DATABRICKS] if platforms is None else platforms
    config = NotebookConfig() if config is None else config

    findings = None
    if findings_path:
        # Deferred import keeps the heavy analysis package optional.
        from customer_retention.analysis.auto_explorer import ExplorationFindings
        findings = ExplorationFindings.load(findings_path)

    saved: Dict[Platform, List[str]] = {}
    for platform in platforms:
        script_generator = create_script_generator(platform, findings, config)
        saved[platform] = script_generator.save_all(str(Path(output_dir) / platform.value))

    return saved
154
+
155
+
156
# Public API of the orchestration package (star-import / documentation surface).
__all__ = [
    "NotebookGenerator", "NotebookStage", "NotebookConfig", "Platform",
    "MLflowConfig", "FeatureStoreConfig", "CellBuilder", "OutputFormat",
    "LocalNotebookGenerator", "DatabricksNotebookGenerator",
    "NotebookRunner", "NotebookValidationResult", "ValidationReport", "ScriptRunner",
    "GenerationResult", "generate_orchestration_notebooks",
    "generate_and_validate_notebooks", "create_notebook_generator",
    "validate_generated_notebooks", "save_validation_report",
    "ScriptGenerator", "LocalScriptGenerator", "DatabricksScriptGenerator",
    "create_script_generator", "generate_orchestration_scripts",
    "ProjectInitializer", "initialize_project",
]
@@ -0,0 +1,55 @@
1
+ from abc import ABC, abstractmethod
2
+ from enum import Enum
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Dict, List, Optional
5
+
6
+ import nbformat
7
+
8
+ from .config import NotebookConfig
9
+
10
+ if TYPE_CHECKING:
11
+ from customer_retention.analysis.auto_explorer import ExplorationFindings
12
+
13
+
14
class NotebookStage(str, Enum):
    """Ordered pipeline stages; each value doubles as the notebook filename stem
    (save_all writes ``<value>.ipynb``)."""

    INGESTION = "01_ingestion"
    PROFILING = "02_profiling"
    CLEANING = "03_cleaning"
    TRANSFORMATION = "04_transformation"
    FEATURE_ENGINEERING = "05_feature_engineering"
    FEATURE_SELECTION = "06_feature_selection"
    MODEL_TRAINING = "07_model_training"
    DEPLOYMENT = "08_deployment"
    MONITORING = "09_monitoring"
    BATCH_INFERENCE = "10_batch_inference"
    FEATURE_STORE = "11_feature_store"
26
+
27
+
28
class NotebookGenerator(ABC):
    """Base class for platform-specific notebook generators.

    Subclasses implement generate_stage(); generate_all() and save_all()
    drive it across every available stage.
    """

    def __init__(self, config: NotebookConfig, findings: Optional["ExplorationFindings"]):
        self.config = config
        self.findings = findings

    @abstractmethod
    def generate_stage(self, stage: NotebookStage) -> nbformat.NotebookNode:
        """Build the notebook for a single pipeline stage."""

    @property
    def available_stages(self) -> List[NotebookStage]:
        """Stages this generator supports.

        A subclass may expose a ``stage_generators`` mapping to narrow the
        set; otherwise every NotebookStage is assumed available.
        """
        if hasattr(self, "stage_generators"):
            return list(self.stage_generators.keys())
        return list(NotebookStage)

    def generate_all(self) -> Dict[NotebookStage, nbformat.NotebookNode]:
        """Generate every available stage, keyed by stage."""
        notebooks = {}
        for stage in self.available_stages:
            notebooks[stage] = self.generate_stage(stage)
        return notebooks

    def save_all(self, output_dir: str) -> List[str]:
        """Write each generated notebook to ``<output_dir>/<stage value>.ipynb``.

        Creates the directory if needed and returns the written paths.
        """
        destination = Path(output_dir)
        destination.mkdir(parents=True, exist_ok=True)
        written: List[str] = []
        for stage, notebook in self.generate_all().items():
            target = destination / f"{stage.value}.ipynb"
            with open(target, "w", encoding="utf-8") as handle:
                nbformat.write(notebook, handle)
            written.append(str(target))
        return written