churnkit-0.75.0a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
customer_retention/generators/notebook_generator/stages/base_stage.py
@@ -0,0 +1,86 @@
+ from abc import ABC, abstractmethod
+ from typing import TYPE_CHECKING, List, Optional
+
+ import nbformat
+
+ from ..base import NotebookStage
+ from ..cell_builder import CellBuilder
+ from ..config import NotebookConfig, Platform
+
+ if TYPE_CHECKING:
+     from customer_retention.analysis.auto_explorer import ExplorationFindings
+
+
+ class StageGenerator(ABC):
+     def __init__(self, config: NotebookConfig, findings: Optional["ExplorationFindings"]):
+         self.config = config
+         self.findings = findings
+         self.cb = CellBuilder
+
+     @property
+     @abstractmethod
+     def stage(self) -> NotebookStage:
+         pass
+
+     @property
+     @abstractmethod
+     def title(self) -> str:
+         pass
+
+     @property
+     def description(self) -> str:
+         return ""
+
+     def generate(self, platform: Platform) -> List[nbformat.NotebookNode]:
+         if platform == Platform.LOCAL:
+             return self.generate_local_cells()
+         return self.generate_databricks_cells()
+
+     @abstractmethod
+     def generate_local_cells(self) -> List[nbformat.NotebookNode]:
+         pass
+
+     @abstractmethod
+     def generate_databricks_cells(self) -> List[nbformat.NotebookNode]:
+         pass
+
+     def header_cells(self) -> List[nbformat.NotebookNode]:
+         cells = [self.cb.header(self.title)]
+         if self.description:
+             cells.append(self.cb.markdown(self.description))
+         return cells
+
+     def get_target_column(self) -> str:
+         if self.findings and hasattr(self.findings, "target_column") and self.findings.target_column:
+             return self.findings.target_column
+         return "target"
+
+     def get_identifier_columns(self) -> List[str]:
+         if self.findings and hasattr(self.findings, "identifier_columns") and self.findings.identifier_columns:
+             return self.findings.identifier_columns
+         return ["customer_id"]
+
+     def get_feature_columns(self) -> List[str]:
+         if not self.findings or not hasattr(self.findings, "columns"):
+             return []
+         from customer_retention.core.config import ColumnType
+         feature_types = {ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE,
+                          ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL, ColumnType.BINARY}
+         return [name for name, col in self.findings.columns.items()
+                 if hasattr(col, "inferred_type") and col.inferred_type in feature_types]
+
+     def get_numeric_columns(self) -> List[str]:
+         if not self.findings or not hasattr(self.findings, "columns"):
+             return []
+         from customer_retention.core.config import ColumnType
+         return [name for name, col in self.findings.columns.items()
+                 if hasattr(col, "inferred_type") and col.inferred_type in
+                 {ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE}]
+
+     def get_categorical_columns(self) -> List[str]:
+         if not self.findings or not hasattr(self.findings, "columns"):
+             return []
+         from customer_retention.core.config import ColumnType
+         return [name for name, col in self.findings.columns.items()
+                 if hasattr(col, "inferred_type") and col.inferred_type in
+                 {ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL}]
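
The stage files that follow all subclass this base. As a minimal sketch, a concrete subclass only needs the two abstract properties and the two cell generators; the EchoStage name and its cell body below are hypothetical, and the CellBuilder.code/header calls are assumed to behave as their usage in the stages above suggests:

from typing import List

import nbformat

from customer_retention.generators.notebook_generator.base import NotebookStage
from customer_retention.generators.notebook_generator.stages.base_stage import StageGenerator

class EchoStage(StageGenerator):
    # Hypothetical stage for illustration; reuses an existing enum member.
    @property
    def stage(self) -> NotebookStage:
        return NotebookStage.INGESTION

    @property
    def title(self) -> str:
        return "00 - Echo"

    def generate_local_cells(self) -> List[nbformat.NotebookNode]:
        # header_cells() emits the title cell; cb.code wraps source into a code cell
        return self.header_cells() + [self.cb.code('print("generated cell")')]

    def generate_databricks_cells(self) -> List[nbformat.NotebookNode]:
        return self.generate_local_cells()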
customer_retention/generators/notebook_generator/stages/s01_ingestion.py
@@ -0,0 +1,100 @@
+ from typing import List
+
+ import nbformat
+
+ from ..base import NotebookStage
+ from .base_stage import StageGenerator
+
+
+ class IngestionStage(StageGenerator):
+     @property
+     def stage(self) -> NotebookStage:
+         return NotebookStage.INGESTION
+
+     @property
+     def title(self) -> str:
+         return "01 - Configuration & Data Ingestion"
+
+     @property
+     def description(self) -> str:
+         return "Load raw data, configure pipeline context, and save to bronze layer."
+
+     def generate_local_cells(self) -> List[nbformat.NotebookNode]:
+         findings_path = self.findings.source_path if self.findings else "./data/customers.csv"
+         source_format = getattr(self.findings, "source_format", "csv") if self.findings else "csv"
+         return self.header_cells() + [
+             self.cb.section("Imports"),
+             self.cb.from_imports_cell({
+                 "customer_retention.generators.orchestration": ["setup_notebook_context", "PipelineContext"],
+                 "customer_retention.stages.ingestion": ["DataSourceRegistry"],
+                 "customer_retention.analysis.auto_explorer": ["ExplorationFindings"],
+                 "customer_retention.stages.temporal": ["ScenarioDetector", "UnifiedDataPreparer"],
+                 "datetime": ["datetime"],
+                 "pathlib": ["Path"],
+             }),
+             self.cb.section("Configuration"),
+             self.cb.code(f'''FINDINGS_PATH = "{findings_path}"
+ DATA_FORMAT = "{source_format}"
+ OUTPUT_DIR = Path("./experiments/data")'''),
+             self.cb.section("Load Exploration Findings"),
+             self.cb.code('''findings = ExplorationFindings.load(FINDINGS_PATH)
+ print(f"Loaded findings: {findings.row_count} rows, {findings.column_count} columns")
+ print(f"Target column: {findings.target_column}")'''),
+             self.cb.section("Setup Pipeline Context"),
+             self.cb.code('''ctx, manager = setup_notebook_context(exploration_findings=findings)
+ print(f"Pipeline context initialized for: {ctx.config.project_name}")'''),
+             self.cb.section("Load Raw Data"),
+             self.cb.code('''registry = DataSourceRegistry()
+ df = registry.load(findings.source_path, format=DATA_FORMAT)
+ print(f"Loaded {len(df)} rows")
+ df.head()'''),
+             self.cb.section("Detect Timestamp Scenario"),
+             self.cb.code('''detector = ScenarioDetector()
+ scenario, ts_config, discovery_result = detector.detect(df, findings.target_column)
+ print(f"Detected scenario: {scenario}")
+ print(f"Strategy: {ts_config.strategy.value}")
+ print(f"Recommendation: {discovery_result.recommendation}")'''),
+             self.cb.section("Prepare Data with Timestamps"),
+             self.cb.code('''preparer = UnifiedDataPreparer(OUTPUT_DIR, ts_config)
+ unified_df = preparer.prepare_from_raw(
+     df,
+     target_column=findings.target_column,
+     entity_column=findings.entity_id_column or "custid"
+ )
+ print(f"Prepared {len(unified_df)} rows with timestamps")
+ print(f"Timestamp columns: feature_timestamp, label_timestamp, label_available_flag")'''),
+             self.cb.section("Create Training Snapshot"),
+             self.cb.code('''cutoff_date = datetime.now()
+ snapshot_df, metadata = preparer.create_training_snapshot(unified_df, cutoff_date)
+ print(f"Created snapshot: {metadata['snapshot_id']}")
+ print(f"Rows: {metadata['row_count']}")
+ print(f"Features: {len(metadata['feature_columns'])}")'''),
+             self.cb.section("Save Processed Data"),
+             self.cb.code('''manager.update(current_df=snapshot_df, current_stage="bronze")
+ print(f"Pipeline context updated. Use snapshot '{metadata['snapshot_id']}' for training.")'''),
+         ]
+
+     def generate_databricks_cells(self) -> List[nbformat.NotebookNode]:
+         catalog = self.config.feature_store.catalog
+         schema = self.config.feature_store.schema
+         data_path = self.findings.source_path if self.findings else "/mnt/landing/customers"
+         source_format = getattr(self.findings, "source_format", "csv") if self.findings else "csv"
+         return self.header_cells() + [
+             self.cb.section("Configuration"),
+             self.cb.code(f'''CATALOG = "{catalog}"
+ SCHEMA = "{schema}"
+ DATA_PATH = "{data_path}"
+ spark.sql(f"USE CATALOG {{CATALOG}}")
+ spark.sql(f"USE SCHEMA {{SCHEMA}}")'''),
+             self.cb.section("Load Raw Data"),
+             self.cb.code(f'''df_raw = (spark.read
+     .format("{source_format}")
+     .option("header", "true")
+     .option("inferSchema", "true")
+     .load(DATA_PATH))
+ print(f"Loaded {{df_raw.count()}} rows")
+ display(df_raw.limit(10))'''),
+             self.cb.section("Save to Bronze Table"),
+             self.cb.code('''df_raw.write.format("delta").mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.bronze_customers")
+ print("Bronze table created")'''),
+         ]
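
Driving this stage into an .ipynb file might look like the sketch below; Platform.LOCAL routes to generate_local_cells as shown in base_stage.py above. Passing findings=None exercises the fallback defaults, and the config construction is elided here since NotebookConfig's fields are not shown in this diff:

import nbformat

from customer_retention.generators.notebook_generator.config import Platform
from customer_retention.generators.notebook_generator.stages.s01_ingestion import IngestionStage

# findings=None falls back to "./data/customers.csv" defaults; config assumed in scope
stage = IngestionStage(config=config, findings=None)
nb = nbformat.v4.new_notebook(cells=stage.generate(Platform.LOCAL))
nbformat.write(nb, "01_ingestion_generated.ipynb")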
customer_retention/generators/notebook_generator/stages/s02_profiling.py
@@ -0,0 +1,95 @@
+ from typing import List
+
+ import nbformat
+
+ from ..base import NotebookStage
+ from .base_stage import StageGenerator
+
+
+ class ProfilingStage(StageGenerator):
+     @property
+     def stage(self) -> NotebookStage:
+         return NotebookStage.PROFILING
+
+     @property
+     def title(self) -> str:
+         return "02 - Data Profiling"
+
+     @property
+     def description(self) -> str:
+         return "Generate column statistics, type detection, and quality metrics."
+
+     def generate_local_cells(self) -> List[nbformat.NotebookNode]:
+         return self.header_cells() + [
+             self.cb.section("Imports"),
+             self.cb.from_imports_cell({
+                 "customer_retention.stages.profiling": ["TypeDetector", "ProfilerFactory", "QualityCheckRegistry"],
+                 "customer_retention.analysis.visualization": ["ChartBuilder"],
+                 "pandas": ["pd"],
+             }),
+             self.cb.section("Load Bronze Data"),
+             self.cb.code('''from customer_retention.integrations.adapters.factory import get_delta
+ storage = get_delta(force_local=True)
+ df = storage.read("./experiments/data/bronze/customers")
+ print(f"Loaded {len(df)} rows, {len(df.columns)} columns")'''),
+             self.cb.section("Type Detection"),
+             self.cb.code('''detector = TypeDetector()
+ type_results = {col: detector.detect(df[col]) for col in df.columns}
+ for col, result in type_results.items():
+     print(f"{col}: {result.column_type.value} (confidence: {result.confidence:.2f})")'''),
+             self.cb.section("Column Profiling"),
+             self.cb.code('''factory = ProfilerFactory()
+ profiles = {}
+ for col in df.columns:
+     profiler = factory.get_profiler(type_results[col].column_type)
+     profiles[col] = profiler.profile(df[col])'''),
+             self.cb.section("Quality Checks"),
+             self.cb.code('''registry = QualityCheckRegistry()
+ checks = registry.get_all_checks()
+ results = []
+ for check in checks:
+     for col in df.columns:
+         result = check.check(df[col], profiles.get(col))
+         if result.passed is False:
+             results.append({"column": col, "check": check.name, "severity": result.severity.value, "message": result.message})
+ quality_df = pd.DataFrame(results)
+ quality_df'''),
+             self.cb.section("Visualize Quality"),
+             self.cb.code('''charts = ChartBuilder()
+ if len(quality_df) > 0:
+     fig = charts.quality_heatmap(quality_df)
+     fig.show()'''),
+         ]
+
+     def generate_databricks_cells(self) -> List[nbformat.NotebookNode]:
+         catalog = self.config.feature_store.catalog
+         schema = self.config.feature_store.schema
+         return self.header_cells() + [
+             self.cb.section("Load Bronze Data"),
+             self.cb.code(f'''df = spark.table("{catalog}.{schema}.bronze_customers")
+ print(f"Loaded {{df.count()}} rows")'''),
+             self.cb.section("Basic Statistics"),
+             self.cb.code('''summary = df.describe()
+ display(summary)'''),
+             self.cb.section("Column Types and Nulls"),
+             self.cb.code('''from pyspark.sql.functions import col, count, when, isnan
+
+ null_counts = df.select([
+     count(when(col(c).isNull() | isnan(col(c)), c)).alias(c)
+     for c in df.columns
+ ])
+ display(null_counts)'''),
+             self.cb.section("Distinct Values"),
+             self.cb.code('''from pyspark.sql.functions import countDistinct
+
+ distinct_counts = df.select([countDistinct(col(c)).alias(c) for c in df.columns])
+ display(distinct_counts)'''),
+             self.cb.section("Save Profiling Results"),
+             self.cb.code('''profile_data = {
+     "columns": df.columns,
+     "dtypes": [str(f.dataType) for f in df.schema.fields],
+     "row_count": df.count()
+ }
+ import json
+ dbutils.fs.put("/tmp/profile_results.json", json.dumps(profile_data), overwrite=True)'''),
+         ]
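
For reference, the null and distinct counts that the Databricks cells compute via pyspark reduce to built-ins on the pandas side; a standalone sketch on synthetic data, not part of the package:

import pandas as pd

df = pd.DataFrame({"plan": ["basic", None, "pro"], "tenure": [1, 12, None]})
print(df.isnull().sum())   # per-column null counts
print(df.nunique())        # per-column distinct counts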
customer_retention/generators/notebook_generator/stages/s03_cleaning.py
@@ -0,0 +1,180 @@
+ from typing import List
+
+ import nbformat
+
+ from ..base import NotebookStage
+ from .base_stage import StageGenerator
+
+
+ class CleaningStage(StageGenerator):
+     @property
+     def stage(self) -> NotebookStage:
+         return NotebookStage.CLEANING
+
+     @property
+     def title(self) -> str:
+         return "03 - Data Cleaning"
+
+     @property
+     def description(self) -> str:
+         return "Handle missing values and outliers based on column types with MLflow tracking."
+
+     def _get_cleaning_recommendations(self) -> dict:
+         recommendations = {}
+         if not self.findings or not hasattr(self.findings, "columns"):
+             return recommendations
+         for col_name, col_finding in self.findings.columns.items():
+             if hasattr(col_finding, "cleaning_recommendations") and col_finding.cleaning_recommendations:
+                 recommendations[col_name] = col_finding.cleaning_recommendations
+         return recommendations
+
+     def generate_local_cells(self) -> List[nbformat.NotebookNode]:
+         numeric_cols = self.get_numeric_columns()
+         categorical_cols = self.get_categorical_columns()
+         tracking_uri = self.config.mlflow.tracking_uri
+         exp_name = self.config.mlflow.experiment_name
+         cleaning_recs = self._get_cleaning_recommendations()
+
+         cells = self.header_cells() + [
+             self.cb.section("Imports"),
+             self.cb.from_imports_cell({
+                 "customer_retention.stages.cleaning": ["MissingValueHandler", "OutlierHandler"],
+                 "customer_retention.integrations.adapters": ["get_mlflow"],
+                 "pandas": ["pd"],
+             }),
+             self.cb.section("Setup MLflow Tracking"),
+             self.cb.code(f'''mlflow_adapter = get_mlflow(tracking_uri="{tracking_uri}", force_local=True)
+ mlflow_adapter.start_run("{exp_name}", run_name="03_data_cleaning")
+ cleaning_stats = {{}}'''),
+             self.cb.section("Load Bronze Data"),
+             self.cb.code('''from customer_retention.integrations.adapters.factory import get_delta
+ storage = get_delta(force_local=True)
+ df = storage.read("./experiments/data/bronze/customers")
+ initial_shape = df.shape
+ initial_nulls = df.isnull().sum().sum()
+ print(f"Initial shape: {df.shape}")
+ print(f"Total missing values: {initial_nulls}")
+
+ mlflow_adapter.log_metrics({
+     "bronze_rows": initial_shape[0],
+     "bronze_columns": initial_shape[1],
+     "bronze_total_nulls": initial_nulls,
+ })'''),
+         ]
+
+         if cleaning_recs:
+             cells.append(self.cb.section("Apply Cleaning from Exploration Findings"))
+             cells.append(self.cb.code(f'''cleaning_recommendations = {cleaning_recs}
+ print(f"Found cleaning recommendations for {{len(cleaning_recommendations)}} columns")'''))
+
+         cells.extend([
+             self.cb.section("Handle Missing Values - Numeric Columns"),
+             self.cb.code(f'''numeric_cols = {numeric_cols}
+ missing_handler = MissingValueHandler(strategy="median")
+ for col in numeric_cols:
+     if col in df.columns and df[col].isnull().any():
+         nulls_before = df[col].isnull().sum()
+         df[col] = missing_handler.fit_transform(df[col])
+         cleaning_stats[f"{{col}}_nulls_imputed"] = nulls_before
+         print(f"Imputed {{col}}: {{nulls_before}} missing values")'''),
+             self.cb.section("Handle Missing Values - Categorical Columns"),
+             self.cb.code(f'''categorical_cols = {categorical_cols}
+ missing_handler_cat = MissingValueHandler(strategy="mode")
+ for col in categorical_cols:
+     if col in df.columns and df[col].isnull().any():
+         nulls_before = df[col].isnull().sum()
+         df[col] = missing_handler_cat.fit_transform(df[col])
+         cleaning_stats[f"{{col}}_nulls_imputed"] = nulls_before
+         print(f"Imputed {{col}}: {{nulls_before}} missing values")'''),
+             self.cb.section("Handle Outliers"),
+             self.cb.code('''outlier_handler = OutlierHandler(method="iqr", treatment="cap")
+ for col in numeric_cols:
+     if col in df.columns:
+         q1, q3 = df[col].quantile([0.25, 0.75])
+         iqr = q3 - q1
+         outliers = ((df[col] < q1 - 1.5*iqr) | (df[col] > q3 + 1.5*iqr)).sum()
+         cleaning_stats[f"{col}_outliers_capped"] = outliers
+         df[col] = outlier_handler.fit_transform(df[col])
+ print("Outliers capped using IQR method")'''),
+             self.cb.section("Log Cleaning Statistics to MLflow"),
+             self.cb.code('''final_nulls = df.isnull().sum().sum()
+ mlflow_adapter.log_params({
+     "numeric_strategy": "median",
+     "categorical_strategy": "mode",
+     "outlier_method": "iqr",
+     "outlier_treatment": "cap",
+ })
+ mlflow_adapter.log_metrics({
+     "silver_rows": df.shape[0],
+     "silver_columns": df.shape[1],
+     "silver_total_nulls": final_nulls,
+     "nulls_removed": initial_nulls - final_nulls,
+     **{k: v for k, v in cleaning_stats.items() if isinstance(v, (int, float))}
+ })
+ print(f"Logged {len(cleaning_stats)} cleaning statistics to MLflow")'''),
+             self.cb.section("Save to Silver Layer"),
+             self.cb.code('''storage.write(df, "./experiments/data/silver/customers_cleaned")
+ mlflow_adapter.end_run()
+ print(f"Silver layer saved: {df.shape}")'''),
+         ])
+         return cells
+
+     def generate_databricks_cells(self) -> List[nbformat.NotebookNode]:
+         catalog = self.config.feature_store.catalog
+         schema = self.config.feature_store.schema
+         exp_name = self.config.mlflow.experiment_name
+         numeric_cols = self.get_numeric_columns()
+         categorical_cols = self.get_categorical_columns()
+         return self.header_cells() + [
+             self.cb.section("Setup MLflow Tracking"),
+             self.cb.code(f'''import mlflow
+
+ mlflow.set_experiment("/Users/{{spark.conf.get('spark.databricks.notebook.username', 'default')}}/{exp_name}")
+ mlflow.start_run(run_name="03_data_cleaning")
+ cleaning_stats = {{}}'''),
+             self.cb.section("Load Bronze Data"),
+             self.cb.code(f'''df = spark.table("{catalog}.{schema}.bronze_customers")
+ initial_count = df.count()
+ print(f"Initial count: {{initial_count}}")
+ mlflow.log_metric("bronze_rows", initial_count)'''),
+             self.cb.section("Handle Missing Values - Numeric Columns"),
+             self.cb.code(f'''from pyspark.sql.functions import col, when, lit, sum as spark_sum
+ from pyspark.ml.feature import Imputer
+
+ numeric_cols = {numeric_cols}
+ imputer = Imputer(inputCols=numeric_cols, outputCols=numeric_cols, strategy="median")
+ df = imputer.fit(df).transform(df)
+ mlflow.log_param("numeric_strategy", "median")
+ print("Numeric columns imputed with median")'''),
+             self.cb.section("Handle Missing Values - Categorical Columns"),
+             self.cb.code(f'''categorical_cols = {categorical_cols}
+ for col_name in categorical_cols:
+     mode_val = df.groupBy(col_name).count().orderBy("count", ascending=False).first()[0]
+     df = df.fillna({{col_name: mode_val}})
+ mlflow.log_param("categorical_strategy", "mode")
+ print("Categorical columns imputed with mode")'''),
+             self.cb.section("Handle Outliers with IQR"),
+             self.cb.code('''for col_name in numeric_cols:
+     quantiles = df.approxQuantile(col_name, [0.25, 0.75], 0.05)
+     if len(quantiles) == 2:
+         q1, q3 = quantiles
+         iqr = q3 - q1
+         lower = q1 - 1.5 * iqr
+         upper = q3 + 1.5 * iqr
+         df = df.withColumn(col_name, when(col(col_name) < lower, lower)
+                            .when(col(col_name) > upper, upper)
+                            .otherwise(col(col_name)))
+ mlflow.log_params({"outlier_method": "iqr", "outlier_treatment": "cap"})
+ print("Outliers capped using IQR")'''),
+             self.cb.section("Log Cleaning Statistics"),
+             self.cb.code('''final_count = df.count()
+ mlflow.log_metrics({
+     "silver_rows": final_count,
+     "rows_preserved_pct": final_count / initial_count * 100,
+ })
+ print(f"Final count: {final_count}")'''),
+             self.cb.section("Save to Silver Table"),
+             self.cb.code(f'''df.write.format("delta").mode("overwrite").saveAsTable("{catalog}.{schema}.silver_customers")
+ mlflow.end_run()
+ print("Silver table created")'''),
+         ]
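
Both the local and Spark variants cap outliers with the same 1.5×IQR fence rule. Written out standalone in pandas (a sketch of the rule itself; OutlierHandler's actual implementation is not shown in this diff):

import pandas as pd

def cap_iqr(s: pd.Series, k: float = 1.5) -> pd.Series:
    # Clip values outside [Q1 - k*IQR, Q3 + k*IQR] to the nearer fence.
    q1, q3 = s.quantile([0.25, 0.75])
    iqr = q3 - q1
    return s.clip(lower=q1 - k * iqr, upper=q3 + k * iqr)

print(cap_iqr(pd.Series([1, 2, 3, 4, 100])))  # 100 is capped to the upper fence (7.0)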
customer_retention/generators/notebook_generator/stages/s04_transformation.py
@@ -0,0 +1,165 @@
+ from typing import List
+
+ import nbformat
+
+ from ..base import NotebookStage
+ from .base_stage import StageGenerator
+
+
+ class TransformationStage(StageGenerator):
+     @property
+     def stage(self) -> NotebookStage:
+         return NotebookStage.TRANSFORMATION
+
+     @property
+     def title(self) -> str:
+         return "04 - Data Transformation"
+
+     @property
+     def description(self) -> str:
+         return "Apply scaling, encoding, and type transformations with MLflow tracking."
+
+     def _get_transform_recommendations(self) -> dict:
+         recommendations = {}
+         if not self.findings or not hasattr(self.findings, "columns"):
+             return recommendations
+         for col_name, col_finding in self.findings.columns.items():
+             if hasattr(col_finding, "transformation_recommendations") and col_finding.transformation_recommendations:
+                 recommendations[col_name] = col_finding.transformation_recommendations
+         return recommendations
+
+     def generate_local_cells(self) -> List[nbformat.NotebookNode]:
+         numeric_cols = self.get_numeric_columns()
+         categorical_cols = self.get_categorical_columns()
+         tracking_uri = self.config.mlflow.tracking_uri
+         exp_name = self.config.mlflow.experiment_name
+         transform_recs = self._get_transform_recommendations()
+
+         cells = self.header_cells() + [
+             self.cb.section("Imports"),
+             self.cb.from_imports_cell({
+                 "customer_retention.stages.transformation": ["NumericTransformer", "CategoricalEncoder"],
+                 "customer_retention.stages.preprocessing": ["TransformerManager"],
+                 "customer_retention.integrations.adapters": ["get_mlflow"],
+                 "pandas": ["pd"],
+             }),
+             self.cb.section("Setup MLflow Tracking"),
+             self.cb.code(f'''mlflow_adapter = get_mlflow(tracking_uri="{tracking_uri}", force_local=True)
+ mlflow_adapter.start_run("{exp_name}", run_name="04_transformation")
+ transform_stats = {{}}'''),
+             self.cb.section("Load Silver Data"),
+             self.cb.code('''from customer_retention.integrations.adapters.factory import get_delta
+ storage = get_delta(force_local=True)
+ df = storage.read("./experiments/data/silver/customers_cleaned")
+ print(f"Loaded shape: {df.shape}")
+ mlflow_adapter.log_metric("input_rows", df.shape[0])
+ mlflow_adapter.log_metric("input_columns", df.shape[1])'''),
+         ]
+
+         if transform_recs:
+             cells.append(self.cb.section("Transformation Recommendations from Exploration"))
+             cells.append(self.cb.code(f'''transform_recommendations = {transform_recs}
+ print(f"Found transformation recommendations for {{len(transform_recommendations)}} columns")'''))
+
+         cells.extend([
+             self.cb.section("Initialize Transformer Manager"),
+             self.cb.code(f'''numeric_cols = {numeric_cols}
+ categorical_cols = {categorical_cols}
+
+ # TransformerManager ensures consistent transformations between training and scoring
+ transformer_manager = TransformerManager(scaler_type="standard")'''),
+             self.cb.section("Fit and Transform Features"),
+             self.cb.code('''# Fit transformers and transform data in one step
+ # Exclude identifier and target columns from transformation
+ exclude_cols = ["customer_id", "target"]  # Adjust based on your data
+ df = transformer_manager.fit_transform(
+     df,
+     numeric_columns=numeric_cols,
+     categorical_columns=categorical_cols,
+     exclude_columns=exclude_cols
+ )
+
+ # Log transformation statistics
+ manifest = transformer_manager.manifest
+ transform_stats["numeric_cols_scaled"] = len(manifest.numeric_columns)
+ transform_stats["categorical_cols_encoded"] = len(manifest.categorical_columns)
+
+ mlflow_adapter.log_params({
+     "scaler_type": manifest.scaler_type,
+     "encoder_type": manifest.encoder_type,
+     "scaled_columns": str(manifest.numeric_columns)[:250],
+     "encoded_columns": str(manifest.categorical_columns)[:250],
+ })
+ print(f"Scaled {len(manifest.numeric_columns)} numeric columns")
+ print(f"Encoded {len(manifest.categorical_columns)} categorical columns")'''),
+             self.cb.section("Save Transformers as Artifacts"),
+             self.cb.code('''# Save transformers locally and to MLflow
+ transformer_manager.save("./experiments/data/transformers/transformers.joblib")
+
+ # Log to MLflow for scoring pipeline to retrieve
+ import mlflow
+ transformer_manager.log_to_mlflow(run_id=mlflow.active_run().info.run_id)
+ print("Transformers saved locally and logged to MLflow")
+ print("Scoring pipeline will use these same transformers for consistency")'''),
+             self.cb.section("Log Transformation Statistics"),
+             self.cb.code('''mlflow_adapter.log_metrics({
+     "output_rows": df.shape[0],
+     "output_columns": df.shape[1],
+     **{k: v for k, v in transform_stats.items() if isinstance(v, (int, float))}
+ })
+ print(f"Logged {len(transform_stats)} transformation statistics")'''),
+             self.cb.section("Save Transformed Data"),
+             self.cb.code('''storage.write(df, "./experiments/data/silver/customers_transformed")
+ mlflow_adapter.end_run()
+ print(f"Transformed data saved: {df.shape}")'''),
+         ])
+         return cells
+
+     def generate_databricks_cells(self) -> List[nbformat.NotebookNode]:
+         catalog = self.config.feature_store.catalog
+         schema = self.config.feature_store.schema
+         exp_name = self.config.mlflow.experiment_name
+         numeric_cols = self.get_numeric_columns()
+         categorical_cols = self.get_categorical_columns()
+         return self.header_cells() + [
+             self.cb.section("Setup MLflow Tracking"),
+             self.cb.code(f'''import mlflow
+
+ mlflow.set_experiment("/Users/{{spark.conf.get('spark.databricks.notebook.username', 'default')}}/{exp_name}")
+ mlflow.start_run(run_name="04_transformation")'''),
+             self.cb.section("Load Silver Data"),
+             self.cb.code(f'''df = spark.table("{catalog}.{schema}.silver_customers")
+ input_count = df.count()
+ mlflow.log_metric("input_rows", input_count)'''),
+             self.cb.section("Scale Numeric Features"),
+             self.cb.code(f'''from pyspark.ml.feature import StandardScaler, VectorAssembler
+
+ numeric_cols = {numeric_cols}
+ if numeric_cols:
+     assembler = VectorAssembler(inputCols=numeric_cols, outputCol="numeric_features")
+     df = assembler.transform(df)
+     scaler = StandardScaler(inputCol="numeric_features", outputCol="scaled_features", withStd=True, withMean=True)
+     scaler_model = scaler.fit(df)
+     df = scaler_model.transform(df)
+     mlflow.log_params({{"scaler_type": "standard", "scaled_columns_count": len(numeric_cols)}})
+     print("Numeric features scaled")'''),
+             self.cb.section("Encode Categorical Features"),
+             self.cb.code(f'''from pyspark.ml.feature import StringIndexer
+
+ categorical_cols = {categorical_cols}
+ for col_name in categorical_cols:
+     indexer = StringIndexer(inputCol=col_name, outputCol=f"{{col_name}}_idx")
+     df = indexer.fit(df).transform(df)
+ mlflow.log_params({{"encoder_type": "string_indexer", "encoded_columns_count": len(categorical_cols)}})
+ print(f"Encoded {{len(categorical_cols)}} categorical columns")'''),
+             self.cb.section("Log Statistics"),
+             self.cb.code('''output_count = df.count()
+ mlflow.log_metrics({
+     "output_rows": output_count,
+     "columns_after_transform": len(df.columns),
+ })'''),
+             self.cb.section("Save Transformed Data"),
+             self.cb.code(f'''df.write.format("delta").mode("overwrite").saveAsTable("{catalog}.{schema}.silver_transformed")
+ mlflow.end_run()
+ print("Transformed data saved")'''),
+         ]
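
The "fit at training, reuse at scoring" contract that the TransformerManager comments describe, shown in a minimal standalone form. This sketch assumes scikit-learn and joblib, which this diff does not itself import; the column names and data are hypothetical:

import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler

numeric_cols = ["tenure", "monthly_charges"]  # hypothetical columns
train_df = pd.DataFrame({"tenure": [1, 12, 36], "monthly_charges": [29.0, 56.0, 99.0]})
score_df = pd.DataFrame({"tenure": [6], "monthly_charges": [45.0]})

scaler = StandardScaler().fit(train_df[numeric_cols])        # fit on training data only
train_df[numeric_cols] = scaler.transform(train_df[numeric_cols])
joblib.dump(scaler, "transformers.joblib")                   # persist for the scoring pipeline

scaler = joblib.load("transformers.joblib")                  # scoring side: load, never refit
score_df[numeric_cols] = scaler.transform(score_df[numeric_cols])
print(score_df)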