churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,1179 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "d428a219",
6
+ "metadata": {
7
+ "papermill": {
8
+ "duration": 0.002775,
9
+ "end_time": "2026-02-02T13:04:00.422352",
10
+ "exception": false,
11
+ "start_time": "2026-02-02T13:04:00.419577",
12
+ "status": "completed"
13
+ },
14
+ "tags": []
15
+ },
16
+ "source": [
17
+ "# Chapter 10: Pipeline Generation\n",
18
+ "\n",
19
+ "Generate production-ready pipeline code from exploration findings.\n",
20
+ "\n",
21
+ "**Generation Targets:**\n",
22
+ "1. **Local (Feast + MLflow)** - Local feature store and experiment tracking\n",
23
+ "2. **Databricks (FS + MLflow)** - Unity Catalog, DLT, Feature Store, MLflow\n",
24
+ "3. **LLM Documentation** - Markdown files for AI-assisted development\n",
25
+ "\n",
26
+ "**Output Formats:**\n",
27
+ "- Python files (`.py`)\n",
28
+ "- Jupyter notebooks (`.ipynb`)\n",
29
+ "\n",
30
+ "---"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "markdown",
35
+ "id": "0b596fe5",
36
+ "metadata": {
37
+ "papermill": {
38
+ "duration": 0.002122,
39
+ "end_time": "2026-02-02T13:04:00.426926",
40
+ "exception": false,
41
+ "start_time": "2026-02-02T13:04:00.424804",
42
+ "status": "completed"
43
+ },
44
+ "tags": []
45
+ },
46
+ "source": [
47
+ "## 10.1 Configuration"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "id": "b805976d",
54
+ "metadata": {
55
+ "execution": {
56
+ "iopub.execute_input": "2026-02-02T13:04:00.431842Z",
57
+ "iopub.status.busy": "2026-02-02T13:04:00.431710Z",
58
+ "iopub.status.idle": "2026-02-02T13:04:01.642128Z",
59
+ "shell.execute_reply": "2026-02-02T13:04:01.640760Z"
60
+ },
61
+ "papermill": {
62
+ "duration": 1.213861,
63
+ "end_time": "2026-02-02T13:04:01.643055",
64
+ "exception": false,
65
+ "start_time": "2026-02-02T13:04:00.429194",
66
+ "status": "completed"
67
+ },
68
+ "tags": []
69
+ },
70
+ "outputs": [],
71
+ "source": [
72
+ "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
73
+ "track_and_export_previous(\"10_spec_generation.ipynb\")\n",
74
+ "\n",
75
+ "from pathlib import Path\n",
76
+ "from enum import Enum\n",
77
+ "\n",
78
+ "class GenerationTarget(Enum):\n",
79
+ " LOCAL_FEAST_MLFLOW = \"local\"\n",
80
+ " DATABRICKS = \"databricks\"\n",
81
+ " LLM_DOCS = \"llm_docs\"\n",
82
+ "\n",
83
+ "class OutputFormat(Enum):\n",
84
+ " PYTHON = \"py\"\n",
85
+ " NOTEBOOK = \"ipynb\"\n",
86
+ "\n",
87
+ "# === USER CONFIGURATION ===\n",
88
+ "PIPELINE_NAME = \"customer_churn\"\n",
89
+ "GENERATION_TARGET = GenerationTarget.LOCAL_FEAST_MLFLOW\n",
90
+ "OUTPUT_FORMAT = OutputFormat.PYTHON\n",
91
+ "\n",
92
+ "# Paths\n",
93
+ "# FINDINGS_DIR is imported from customer_retention.core.config.experiments in the next code cell (section 10.2)\n",
94
+ "OUTPUT_BASE_DIR = Path(\"../generated_pipelines\")\n",
95
+ "\n",
96
+ "# Databricks settings (only used when GENERATION_TARGET == DATABRICKS)\n",
97
+ "DATABRICKS_CATALOG = \"main\"\n",
98
+ "DATABRICKS_SCHEMA = \"ml_features\"\n",
99
+ "\n",
100
+ "print(f\"Pipeline: {PIPELINE_NAME}\")\n",
101
+ "print(f\"Target: {GENERATION_TARGET.value}\")\n",
102
+ "print(f\"Format: {OUTPUT_FORMAT.value}\")\n",
103
+ "from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "markdown",
108
+ "id": "2ebe3689",
109
+ "metadata": {
110
+ "papermill": {
111
+ "duration": 0.001682,
112
+ "end_time": "2026-02-02T13:04:01.646970",
113
+ "exception": false,
114
+ "start_time": "2026-02-02T13:04:01.645288",
115
+ "status": "completed"
116
+ },
117
+ "tags": []
118
+ },
119
+ "source": [
120
+ "## 10.2 Load Findings and Recommendations"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "id": "1ae682f2",
127
+ "metadata": {
128
+ "execution": {
129
+ "iopub.execute_input": "2026-02-02T13:04:01.651834Z",
130
+ "iopub.status.busy": "2026-02-02T13:04:01.651705Z",
131
+ "iopub.status.idle": "2026-02-02T13:04:02.436391Z",
132
+ "shell.execute_reply": "2026-02-02T13:04:02.435891Z"
133
+ },
134
+ "papermill": {
135
+ "duration": 0.788618,
136
+ "end_time": "2026-02-02T13:04:02.437274",
137
+ "exception": false,
138
+ "start_time": "2026-02-02T13:04:01.648656",
139
+ "status": "completed"
140
+ },
141
+ "tags": []
142
+ },
143
+ "outputs": [],
144
+ "source": [
145
+ "import yaml\n",
146
+ "from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
147
+ "from customer_retention.analysis.auto_explorer.layered_recommendations import RecommendationRegistry\n",
148
+ "from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure\n",
149
+ "\n",
150
+ "def load_findings_and_recommendations(findings_dir: Path):\n",
151
+ " findings_files = sorted(\n",
152
+ " [f for f in findings_dir.glob(\"*_findings.yaml\") if \"multi_dataset\" not in f.name],\n",
153
+ " key=lambda f: f.stat().st_mtime, reverse=True\n",
154
+ " )\n",
155
+ " if not findings_files:\n",
156
+ " raise FileNotFoundError(f\"No findings in {findings_dir}. Run exploration notebooks first.\")\n",
157
+ " \n",
158
+ " findings = ExplorationFindings.load(str(findings_files[0]))\n",
159
+ " \n",
160
+ " # Look for recommendations file matching the findings file pattern\n",
161
+ " # Step 06 saves as: {name}_recommendations.yaml (matching {name}_findings.yaml)\n",
162
+ " findings_name = findings_files[0].stem.replace(\"_findings\", \"\")\n",
163
+ " recommendations_path = findings_dir / f\"{findings_name}_recommendations.yaml\"\n",
164
+ " \n",
165
+ " # Fallback to generic recommendations.yaml if not found\n",
166
+ " if not recommendations_path.exists():\n",
167
+ " recommendations_path = findings_dir / \"recommendations.yaml\"\n",
168
+ " \n",
169
+ " # Final fallback: find any *_recommendations.yaml\n",
170
+ " if not recommendations_path.exists():\n",
171
+ " rec_files = sorted(findings_dir.glob(\"*_recommendations.yaml\"), \n",
172
+ " key=lambda f: f.stat().st_mtime, reverse=True)\n",
173
+ " if rec_files:\n",
174
+ " recommendations_path = rec_files[0]\n",
175
+ " \n",
176
+ " registry = None\n",
177
+ " if recommendations_path.exists():\n",
178
+ " with open(recommendations_path) as f:\n",
179
+ " registry = RecommendationRegistry.from_dict(yaml.safe_load(f))\n",
180
+ " print(f\"Loaded recommendations from: {recommendations_path.name}\")\n",
181
+ " \n",
182
+ " multi_dataset_path = findings_dir / \"multi_dataset_findings.yaml\"\n",
183
+ " multi_dataset = None\n",
184
+ " if multi_dataset_path.exists():\n",
185
+ " with open(multi_dataset_path) as f:\n",
186
+ " multi_dataset = yaml.safe_load(f)\n",
187
+ " \n",
188
+ " return findings, registry, multi_dataset\n",
189
+ "\n",
190
+ "findings, registry, multi_dataset = load_findings_and_recommendations(FINDINGS_DIR)\n",
191
+ "\n",
192
+ "print(f\"Loaded: {findings.source_path}\")\n",
193
+ "print(f\"Rows: {findings.row_count:,} | Columns: {findings.column_count}\")\n",
194
+ "print(f\"Target: {findings.target_column}\")\n",
195
+ "print(f\"Recommendations: {'Loaded' if registry else 'Not found'}\")\n",
196
+ "print(f\"Multi-dataset: {'Loaded' if multi_dataset else 'Not found'}\")"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "markdown",
201
+ "id": "7ad0f92a",
202
+ "metadata": {
203
+ "papermill": {
204
+ "duration": 0.001807,
205
+ "end_time": "2026-02-02T13:04:02.441262",
206
+ "exception": false,
207
+ "start_time": "2026-02-02T13:04:02.439455",
208
+ "status": "completed"
209
+ },
210
+ "tags": []
211
+ },
212
+ "source": [
213
+ "## 10.3 Review Layered Recommendations\n",
214
+ "\n",
215
+ "Recommendations are organized by medallion layer:\n",
216
+ "- **Bronze**: null_handling, outlier_handling, type_conversions, deduplication, filtering, text_processing\n",
217
+ "- **Silver**: joins, aggregations, derived_columns\n",
218
+ "- **Gold**: encoding, scaling, feature_selection, transformations"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": null,
224
+ "id": "2dd7c21d",
225
+ "metadata": {
226
+ "execution": {
227
+ "iopub.execute_input": "2026-02-02T13:04:02.445669Z",
228
+ "iopub.status.busy": "2026-02-02T13:04:02.445471Z",
229
+ "iopub.status.idle": "2026-02-02T13:04:02.448183Z",
230
+ "shell.execute_reply": "2026-02-02T13:04:02.447765Z"
231
+ },
232
+ "papermill": {
233
+ "duration": 0.005594,
234
+ "end_time": "2026-02-02T13:04:02.448608",
235
+ "exception": false,
236
+ "start_time": "2026-02-02T13:04:02.443014",
237
+ "status": "completed"
238
+ },
239
+ "tags": []
240
+ },
241
+ "outputs": [],
242
+ "source": [
243
+ "def display_recommendations(registry: RecommendationRegistry):\n",
244
+ " if not registry:\n",
245
+ " print(\"No recommendations loaded. Run notebooks 02-07 first.\")\n",
246
+ " return\n",
247
+ " \n",
248
+ " for layer in [\"bronze\", \"silver\", \"gold\"]:\n",
249
+ " recs = registry.get_by_layer(layer)\n",
250
+ " print(f\"\\n{layer.upper()} ({len(recs)} recommendations):\")\n",
251
+ " print(\"-\" * 50)\n",
252
+ " for rec in recs[:5]:\n",
253
+ " print(f\" [{rec.category}] {rec.target_column}: {rec.action}\")\n",
254
+ " if len(recs) > 5:\n",
255
+ " print(f\" ... and {len(recs) - 5} more\")\n",
256
+ "\n",
257
+ "display_recommendations(registry)"
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "markdown",
262
+ "id": "d2939644",
263
+ "metadata": {
264
+ "papermill": {
265
+ "duration": 0.001881,
266
+ "end_time": "2026-02-02T13:04:02.452541",
267
+ "exception": false,
268
+ "start_time": "2026-02-02T13:04:02.450660",
269
+ "status": "completed"
270
+ },
271
+ "tags": []
272
+ },
273
+ "source": [
274
+ "---\n",
275
+ "\n",
276
+ "## 10.4 Generate Pipeline\n",
277
+ "\n",
278
+ "Only the option below that matches the configured `GENERATION_TARGET` generates artifacts; the other option cells print a skip message."
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": null,
284
+ "id": "2b072a27",
285
+ "metadata": {
286
+ "execution": {
287
+ "iopub.execute_input": "2026-02-02T13:04:02.456785Z",
288
+ "iopub.status.busy": "2026-02-02T13:04:02.456694Z",
289
+ "iopub.status.idle": "2026-02-02T13:04:02.458991Z",
290
+ "shell.execute_reply": "2026-02-02T13:04:02.458559Z"
291
+ },
292
+ "papermill": {
293
+ "duration": 0.005117,
294
+ "end_time": "2026-02-02T13:04:02.459466",
295
+ "exception": false,
296
+ "start_time": "2026-02-02T13:04:02.454349",
297
+ "status": "completed"
298
+ },
299
+ "tags": []
300
+ },
301
+ "outputs": [],
302
+ "source": [
303
+ "import os\n",
304
+ "\n",
305
+ "output_dir = OUTPUT_BASE_DIR / GENERATION_TARGET.value / PIPELINE_NAME\n",
306
+ "output_dir.mkdir(parents=True, exist_ok=True)\n",
307
+ "\n",
308
+ "print(f\"Output directory: {output_dir}\")"
309
+ ]
310
+ },
311
+ {
312
+ "cell_type": "markdown",
313
+ "id": "ebf9479d",
314
+ "metadata": {
315
+ "papermill": {
316
+ "duration": 0.00201,
317
+ "end_time": "2026-02-02T13:04:02.463438",
318
+ "exception": false,
319
+ "start_time": "2026-02-02T13:04:02.461428",
320
+ "status": "completed"
321
+ },
322
+ "tags": []
323
+ },
324
+ "source": [
325
+ "### Option A: Local (Feast + MLFlow)"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "execution_count": null,
331
+ "id": "cdac3474",
332
+ "metadata": {
333
+ "execution": {
334
+ "iopub.execute_input": "2026-02-02T13:04:02.468115Z",
335
+ "iopub.status.busy": "2026-02-02T13:04:02.468014Z",
336
+ "iopub.status.idle": "2026-02-02T13:04:02.852402Z",
337
+ "shell.execute_reply": "2026-02-02T13:04:02.851928Z"
338
+ },
339
+ "papermill": {
340
+ "duration": 0.387702,
341
+ "end_time": "2026-02-02T13:04:02.852919",
342
+ "exception": false,
343
+ "start_time": "2026-02-02T13:04:02.465217",
344
+ "status": "completed"
345
+ },
346
+ "tags": []
347
+ },
348
+ "outputs": [],
349
+ "source": [
350
+ "if GENERATION_TARGET == GenerationTarget.LOCAL_FEAST_MLFLOW:\n",
351
+ " from customer_retention.generators.spec_generator import MLflowPipelineGenerator, MLflowConfig\n",
352
+ " from customer_retention.generators.pipeline_generator import PipelineGenerator\n",
353
+ " \n",
354
+ " mlflow_config = MLflowConfig(\n",
355
+ " tracking_uri=\"./mlruns\",\n",
356
+ " experiment_name=PIPELINE_NAME,\n",
357
+ " log_data_quality=True,\n",
358
+ " nested_runs=True\n",
359
+ " )\n",
360
+ " \n",
361
+ " mlflow_gen = MLflowPipelineGenerator(mlflow_config=mlflow_config, output_dir=str(output_dir))\n",
362
+ " \n",
363
+ " if OUTPUT_FORMAT == OutputFormat.PYTHON:\n",
364
+ " saved = mlflow_gen.save_all(findings)\n",
365
+ " print(\"Generated MLflow pipeline files:\")\n",
366
+ " for f in saved:\n",
367
+ " print(f\" {f}\")\n",
368
+ " \n",
369
+ " pipeline_gen = PipelineGenerator(\n",
370
+ " findings_dir=str(FINDINGS_DIR),\n",
371
+ " output_dir=str(output_dir),\n",
372
+ " pipeline_name=PIPELINE_NAME,\n",
373
+ " experiments_dir=str(EXPERIMENTS_DIR),\n",
374
+ " )\n",
375
+ " orch_files = pipeline_gen.generate()\n",
376
+ " print(\"\\nGenerated pipeline files (Bronze/Silver/Gold/Training):\")\n",
377
+ " for f in orch_files:\n",
378
+ " print(f\" {f}\")\n",
379
+ "else:\n",
380
+ " print(f\"Skipping Local generation (target is {GENERATION_TARGET.value})\")"
381
+ ]
382
+ },
383
+ {
384
+ "cell_type": "markdown",
385
+ "id": "ad9a65a4",
386
+ "metadata": {
387
+ "papermill": {
388
+ "duration": 0.00197,
389
+ "end_time": "2026-02-02T13:04:02.857183",
390
+ "exception": false,
391
+ "start_time": "2026-02-02T13:04:02.855213",
392
+ "status": "completed"
393
+ },
394
+ "tags": []
395
+ },
396
+ "source": [
397
+ "### Option B: Databricks (FS + MLFlow)"
398
+ ]
399
+ },
400
+ {
401
+ "cell_type": "code",
402
+ "execution_count": null,
403
+ "id": "49d55480",
404
+ "metadata": {
405
+ "execution": {
406
+ "iopub.execute_input": "2026-02-02T13:04:02.861918Z",
407
+ "iopub.status.busy": "2026-02-02T13:04:02.861818Z",
408
+ "iopub.status.idle": "2026-02-02T13:04:02.864347Z",
409
+ "shell.execute_reply": "2026-02-02T13:04:02.864006Z"
410
+ },
411
+ "papermill": {
412
+ "duration": 0.005833,
413
+ "end_time": "2026-02-02T13:04:02.864872",
414
+ "exception": false,
415
+ "start_time": "2026-02-02T13:04:02.859039",
416
+ "status": "completed"
417
+ },
418
+ "tags": []
419
+ },
420
+ "outputs": [],
421
+ "source": [
422
+ "if GENERATION_TARGET == GenerationTarget.DATABRICKS:\n",
423
+ " from customer_retention.generators.spec_generator import DatabricksSpecGenerator, PipelineSpec, SourceSpec\n",
424
+ " \n",
425
+ " spec = PipelineSpec(\n",
426
+ " name=PIPELINE_NAME,\n",
427
+ " version=\"1.0.0\",\n",
428
+ " sources=[SourceSpec(\n",
429
+ " name=findings.source_path.split(\"/\")[-1].replace(\".csv\", \"\"),\n",
430
+ " path=findings.source_path,\n",
431
+ " format=findings.source_format\n",
432
+ " )]\n",
433
+ " )\n",
434
+ " \n",
435
+ " if findings.target_column:\n",
436
+ " from customer_retention.generators.spec_generator import ModelSpec\n",
437
+ " spec.model_config = ModelSpec(\n",
438
+ " name=f\"{PIPELINE_NAME}_model\",\n",
439
+ " model_type=\"gradient_boosting\",\n",
440
+ " target_column=findings.target_column\n",
441
+ " )\n",
442
+ " \n",
443
+ " db_gen = DatabricksSpecGenerator(\n",
444
+ " catalog=DATABRICKS_CATALOG,\n",
445
+ " schema=DATABRICKS_SCHEMA,\n",
446
+ " output_dir=str(output_dir)\n",
447
+ " )\n",
448
+ " \n",
449
+ " saved = db_gen.save_all(spec)\n",
450
+ " print(\"Generated Databricks artifacts:\")\n",
451
+ " for f in saved:\n",
452
+ " print(f\" {f}\")\n",
453
+ "else:\n",
454
+ " print(f\"Skipping Databricks generation (target is {GENERATION_TARGET.value})\")"
455
+ ]
456
+ },
457
+ {
458
+ "cell_type": "markdown",
459
+ "id": "1ed22b7d",
460
+ "metadata": {
461
+ "papermill": {
462
+ "duration": 0.001823,
463
+ "end_time": "2026-02-02T13:04:02.868650",
464
+ "exception": false,
465
+ "start_time": "2026-02-02T13:04:02.866827",
466
+ "status": "completed"
467
+ },
468
+ "tags": []
469
+ },
470
+ "source": [
471
+ "### Option C: LLM Documentation"
472
+ ]
473
+ },
474
+ {
475
+ "cell_type": "code",
476
+ "execution_count": null,
477
+ "id": "fffd0b3a",
478
+ "metadata": {
479
+ "execution": {
480
+ "iopub.execute_input": "2026-02-02T13:04:02.873141Z",
481
+ "iopub.status.busy": "2026-02-02T13:04:02.873046Z",
482
+ "iopub.status.idle": "2026-02-02T13:04:02.880054Z",
483
+ "shell.execute_reply": "2026-02-02T13:04:02.879602Z"
484
+ },
485
+ "papermill": {
486
+ "duration": 0.010095,
487
+ "end_time": "2026-02-02T13:04:02.880545",
488
+ "exception": false,
489
+ "start_time": "2026-02-02T13:04:02.870450",
490
+ "status": "completed"
491
+ },
492
+ "tags": []
493
+ },
494
+ "outputs": [],
495
+ "source": [
496
+ "if GENERATION_TARGET == GenerationTarget.LLM_DOCS:\n",
497
+ " from customer_retention.analysis.auto_explorer import RecommendationEngine\n",
498
+ " \n",
499
+ " recommender = RecommendationEngine()\n",
500
+ " target_rec = recommender.recommend_target(findings)\n",
501
+ " feature_recs = recommender.recommend_features(findings)\n",
502
+ " cleaning_recs = recommender.recommend_cleaning(findings)\n",
503
+ " \n",
504
+ " docs_dir = output_dir / \"docs\"\n",
505
+ " docs_dir.mkdir(parents=True, exist_ok=True)\n",
506
+ " \n",
507
+ " # 1. Overview\n",
508
+ " overview = f\"\"\"# {PIPELINE_NAME} Pipeline Overview\n",
509
+ "\n",
510
+ "## Data Source\n",
511
+ "- **Path**: {findings.source_path}\n",
512
+ "- **Format**: {findings.source_format}\n",
513
+ "- **Rows**: {findings.row_count:,}\n",
514
+ "- **Columns**: {findings.column_count}\n",
515
+ "- **Quality Score**: {findings.overall_quality_score:.1f}/100\n",
516
+ "\n",
517
+ "## Target Variable\n",
518
+ "- **Column**: {target_rec.column_name}\n",
519
+ "- **Type**: {target_rec.target_type}\n",
520
+ "- **Rationale**: {target_rec.rationale}\n",
521
+ "\n",
522
+ "## Column Types\n",
523
+ "| Column | Type | Nulls | Unique |\n",
524
+ "|--------|------|-------|--------|\n",
525
+ "\"\"\"\n",
526
+ " for name, col in list(findings.columns.items())[:20]:\n",
527
+ " overview += f\"| {name} | {col.inferred_type.value} | {col.null_percentage:.1f}% | {col.unique_count} |\\n\"\n",
528
+ " (docs_dir / \"01_overview.md\").write_text(overview)\n",
529
+ " \n",
530
+ " # 2. Bronze layer - separate file per source\n",
531
+ " if registry and registry.sources:\n",
532
+ " for source_name, bronze_recs in registry.sources.items():\n",
533
+ " bronze_doc = f\"\"\"# Bronze Layer - {source_name}\n",
534
+ "\n",
535
+ "## Source File\n",
536
+ "`{bronze_recs.source_file}`\n",
537
+ "\n",
538
+ "## Null Handling\n",
539
+ "\"\"\"\n",
540
+ " for rec in bronze_recs.null_handling:\n",
541
+ " bronze_doc += f\"- `{rec.target_column}`: {rec.action} ({rec.parameters.get('strategy', '')}) - {rec.rationale}\\n\"\n",
542
+ " \n",
543
+ " bronze_doc += \"\\n## Outlier Handling\\n\"\n",
544
+ " for rec in bronze_recs.outlier_handling:\n",
545
+ " bronze_doc += f\"- `{rec.target_column}`: {rec.action} - {rec.rationale}\\n\"\n",
546
+ " \n",
547
+ " bronze_doc += \"\\n## Type Conversions\\n\"\n",
548
+ " for rec in bronze_recs.type_conversions:\n",
549
+ " bronze_doc += f\"- `{rec.target_column}`: {rec.action} - {rec.rationale}\\n\"\n",
550
+ " \n",
551
+ " bronze_doc += \"\\n## Deduplication\\n\"\n",
552
+ " for rec in bronze_recs.deduplication:\n",
553
+ " bronze_doc += f\"- `{rec.target_column}`: {rec.action} - {rec.rationale}\\n\"\n",
554
+ " \n",
555
+ " bronze_doc += \"\\n## Filtering\\n\"\n",
556
+ " for rec in bronze_recs.filtering:\n",
557
+ " bronze_doc += f\"- `{rec.target_column}`: {rec.action} - {rec.rationale}\\n\"\n",
558
+ " \n",
559
+ " bronze_doc += \"\\n## Text Processing\\n\"\n",
560
+ " for rec in bronze_recs.text_processing:\n",
561
+ " bronze_doc += f\"- `{rec.target_column}`: {rec.action} - {rec.rationale}\\n\"\n",
562
+ " \n",
563
+ " safe_name = source_name.replace(\" \", \"_\").lower()\n",
564
+ " (docs_dir / f\"02_bronze_cleaning_{safe_name}.md\").write_text(bronze_doc)\n",
565
+ " else:\n",
566
+ " bronze_doc = f\"\"\"# Bronze Layer - Data Cleaning\n",
567
+ "\n",
568
+ "## Cleaning Recommendations\n",
569
+ "\"\"\"\n",
570
+ " for rec in cleaning_recs:\n",
571
+ " bronze_doc += f\"\\n### {rec.column_name}\\n- **Strategy**: {rec.strategy}\\n- **Severity**: {rec.severity}\\n- **Rationale**: {rec.rationale}\\n\"\n",
572
+ " (docs_dir / \"02_bronze_cleaning.md\").write_text(bronze_doc)\n",
573
+ " \n",
574
+ " # 3. Silver layer\n",
575
+ " silver_doc = \"\"\"# Silver Layer - Feature Engineering\n",
576
+ "\n",
577
+ "## Aggregations and Joins\n",
578
+ "\"\"\"\n",
579
+ " if registry and registry.silver:\n",
580
+ " silver_doc += \"\\n### Joins\\n\"\n",
581
+ " for rec in registry.silver.joins:\n",
582
+ " silver_doc += f\"- {rec.parameters.get('left_source', '')} ⟷ {rec.parameters.get('right_source', '')} on `{rec.parameters.get('join_keys', [])}`\\n\"\n",
583
+ " \n",
584
+ " silver_doc += \"\\n### Aggregations\\n\"\n",
585
+ " for rec in registry.silver.aggregations:\n",
586
+ " silver_doc += f\"- `{rec.target_column}`: {rec.action} - windows: {rec.parameters.get('windows', [])}\\n\"\n",
587
+ " \n",
588
+ " silver_doc += \"\\n### Derived Columns\\n\"\n",
589
+ " for rec in registry.silver.derived_columns:\n",
590
+ " silver_doc += f\"- `{rec.target_column}`: {rec.parameters.get('expression', rec.action)}\\n\"\n",
591
+ " else:\n",
592
+ " silver_doc += \"\\nNo silver-layer recommendations found.\\n\"\n",
593
+ " (docs_dir / \"03_silver_features.md\").write_text(silver_doc)\n",
594
+ " \n",
595
+ " # 4. Gold layer\n",
596
+ " gold_doc = \"\"\"# Gold Layer - ML Features\n",
597
+ "\n",
598
+ "## Feature Recommendations\n",
599
+ "\"\"\"\n",
600
+ " for rec in feature_recs[:15]:\n",
601
+ " gold_doc += f\"\\n### {rec.feature_name}\\n- **Source**: {rec.source_column}\\n- **Type**: {rec.feature_type}\\n- **Description**: {rec.description}\\n\"\n",
602
+ " \n",
603
+ " if registry and registry.gold:\n",
604
+ " gold_doc += \"\\n## Encoding\\n\"\n",
605
+ " for rec in registry.gold.encoding:\n",
606
+ " gold_doc += f\"- `{rec.target_column}`: {rec.parameters.get('method', rec.action)}\\n\"\n",
607
+ " \n",
608
+ " gold_doc += \"\\n## Scaling\\n\"\n",
609
+ " for rec in registry.gold.scaling:\n",
610
+ " gold_doc += f\"- `{rec.target_column}`: {rec.parameters.get('method', rec.action)}\\n\"\n",
611
+ " \n",
612
+ " gold_doc += \"\\n## Feature Selection\\n\"\n",
613
+ " for rec in registry.gold.feature_selection:\n",
614
+ " gold_doc += f\"- `{rec.target_column}`: {rec.action} - {rec.rationale}\\n\"\n",
615
+ " \n",
616
+ " gold_doc += \"\\n## Transformations\\n\"\n",
617
+ " for rec in registry.gold.transformations:\n",
618
+ " gold_doc += f\"- `{rec.target_column}`: {rec.action} - {rec.parameters}\\n\"\n",
619
+ " (docs_dir / \"04_gold_ml_features.md\").write_text(gold_doc)\n",
620
+ " \n",
621
+ " # 5. Training\n",
622
+ " training_doc = f\"\"\"# Model Training\n",
623
+ "\n",
624
+ "## Target\n",
625
+ "- **Column**: {target_rec.column_name}\n",
626
+ "- **Type**: {target_rec.target_type}\n",
627
+ "\n",
628
+ "## Recommended Models\n",
629
+ "1. **Gradient Boosting** - Good for tabular data with mixed types\n",
630
+ "2. **Random Forest** - Robust baseline, handles missing values\n",
631
+ "3. **Logistic Regression** - Interpretable, good for imbalanced data\n",
632
+ "\n",
633
+ "## Evaluation Metrics\n",
634
+ "- ROC-AUC (primary)\n",
635
+ "- Precision/Recall at threshold\n",
636
+ "- F1 Score\n",
637
+ "\"\"\"\n",
638
+ " (docs_dir / \"05_training.md\").write_text(training_doc)\n",
639
+ " \n",
640
+ " print(\"Generated LLM documentation:\")\n",
641
+ " for f in sorted(docs_dir.glob(\"*.md\")):\n",
642
+ " print(f\" {f.name}\")\n",
643
+ "else:\n",
644
+ " print(f\"Skipping LLM docs generation (target is {GENERATION_TARGET.value})\")"
645
+ ]
646
+ },
647
+ {
648
+ "cell_type": "markdown",
649
+ "id": "09cfe62c",
650
+ "metadata": {
651
+ "papermill": {
652
+ "duration": 0.001929,
653
+ "end_time": "2026-02-02T13:04:02.884785",
654
+ "exception": false,
655
+ "start_time": "2026-02-02T13:04:02.882856",
656
+ "status": "completed"
657
+ },
658
+ "tags": []
659
+ },
660
+ "source": [
661
+ "---\n",
662
+ "\n",
663
+ "## 10.5 Convert to Notebooks (Optional)"
664
+ ]
665
+ },
666
+ {
667
+ "cell_type": "code",
668
+ "execution_count": null,
669
+ "id": "8a006314",
670
+ "metadata": {
671
+ "execution": {
672
+ "iopub.execute_input": "2026-02-02T13:04:02.890553Z",
673
+ "iopub.status.busy": "2026-02-02T13:04:02.890435Z",
674
+ "iopub.status.idle": "2026-02-02T13:04:02.893885Z",
675
+ "shell.execute_reply": "2026-02-02T13:04:02.893497Z"
676
+ },
677
+ "papermill": {
678
+ "duration": 0.007273,
679
+ "end_time": "2026-02-02T13:04:02.894388",
680
+ "exception": false,
681
+ "start_time": "2026-02-02T13:04:02.887115",
682
+ "status": "completed"
683
+ },
684
+ "tags": []
685
+ },
686
+ "outputs": [],
687
+ "source": [
688
+ "import json\n",
689
+ "\n",
690
+ "def py_to_notebook(py_path: Path):\n",
691
+ " content = py_path.read_text()\n",
692
+ " cells = []\n",
693
+ " current_lines = []\n",
694
+ " \n",
695
+ " for line in content.split(\"\\n\"):\n",
696
+ " if line.startswith(\"# %% \") or line.startswith(\"# %%\\n\"):\n",
697
+ " if current_lines:\n",
698
+ " cells.append({\"cell_type\": \"code\", \"metadata\": {}, \"source\": current_lines, \"outputs\": [], \"execution_count\": None})\n",
699
+ " current_lines = []\n",
700
+ " title = line.replace(\"# %% \", \"\").strip()\n",
701
+ " if title:\n",
702
+ " cells.append({\"cell_type\": \"markdown\", \"metadata\": {}, \"source\": [f\"## {title}\"]})\n",
703
+ " else:\n",
704
+ " current_lines.append(line + \"\\n\")\n",
705
+ " \n",
706
+ " if current_lines:\n",
707
+ " cells.append({\"cell_type\": \"code\", \"metadata\": {}, \"source\": current_lines, \"outputs\": [], \"execution_count\": None})\n",
708
+ " \n",
709
+ " notebook = {\n",
710
+ " \"cells\": cells,\n",
711
+ " \"metadata\": {\"kernelspec\": {\"display_name\": \"Python 3\", \"language\": \"python\", \"name\": \"python3\"}},\n",
712
+ " \"nbformat\": 4, \"nbformat_minor\": 4\n",
713
+ " }\n",
714
+ " \n",
715
+ " out_path = py_path.with_suffix(\".ipynb\")\n",
716
+ " out_path.write_text(json.dumps(notebook, indent=1))\n",
717
+ " return out_path\n",
718
+ "\n",
719
+ "if OUTPUT_FORMAT == OutputFormat.NOTEBOOK:\n",
720
+ " print(\"Converting Python files to notebooks...\")\n",
721
+ " for py_file in output_dir.rglob(\"*.py\"):\n",
722
+ " if py_file.name != \"__init__.py\":\n",
723
+ " nb_path = py_to_notebook(py_file)\n",
724
+ " print(f\" {py_file.name} -> {nb_path.name}\")\n",
725
+ "else:\n",
726
+ " print(\"Output format is Python. Set OUTPUT_FORMAT = OutputFormat.NOTEBOOK to convert.\")"
727
+ ]
728
+ },
729
+ {
730
+ "cell_type": "markdown",
731
+ "id": "6e53142b",
732
+ "metadata": {
733
+ "papermill": {
734
+ "duration": 0.00193,
735
+ "end_time": "2026-02-02T13:04:02.898619",
736
+ "exception": false,
737
+ "start_time": "2026-02-02T13:04:02.896689",
738
+ "status": "completed"
739
+ },
740
+ "tags": []
741
+ },
742
+ "source": [
743
+ "---\n",
744
+ "\n",
745
+ "## 10.6 Run Pipeline\n",
746
+ "\n",
747
+ "Single command runs everything: Bronze (parallel) → Silver → Gold → Training → MLflow UI (auto-opens browser)."
748
+ ]
749
+ },
750
+ {
751
+ "cell_type": "code",
752
+ "execution_count": null,
753
+ "id": "482c6e06",
754
+ "metadata": {
755
+ "execution": {
756
+ "iopub.execute_input": "2026-02-02T13:04:02.903213Z",
757
+ "iopub.status.busy": "2026-02-02T13:04:02.903112Z",
758
+ "iopub.status.idle": "2026-02-02T13:04:17.357131Z",
759
+ "shell.execute_reply": "2026-02-02T13:04:17.356666Z"
760
+ },
761
+ "papermill": {
762
+ "duration": 14.457404,
763
+ "end_time": "2026-02-02T13:04:17.357888",
764
+ "exception": false,
765
+ "start_time": "2026-02-02T13:04:02.900484",
766
+ "status": "completed"
767
+ },
768
+ "tags": []
769
+ },
770
+ "outputs": [],
771
+ "source": [
772
+ "# Uncomment below to run the pipeline after generation\n",
773
+ "# RUN_PIPELINE = True\n",
774
+ "\n",
775
+ "RUN_PIPELINE = True\n",
776
+ "\n",
777
+ "runner_path = output_dir / \"pipeline_runner.py\"\n",
778
+ "\n",
779
+ "if RUN_PIPELINE and GENERATION_TARGET == GenerationTarget.LOCAL_FEAST_MLFLOW:\n",
780
+ " import subprocess\n",
781
+ " if runner_path.exists():\n",
782
+ " print(f\"Running: python {runner_path.name}\")\n",
783
+ " print(\"Pipeline will run Bronze β†’ Silver β†’ Gold β†’ Training...\")\n",
784
+ " subprocess.run([\"python\", \"pipeline_runner.py\"], cwd=str(output_dir.resolve()))\n",
785
+ " else:\n",
786
+ " print(f\"pipeline_runner.py not found. Generate first by running cells above.\")\n",
787
+ "else:\n",
788
+ " print(\"To run the complete pipeline:\")\n",
789
+ " print(f\"\\n cd {output_dir}\")\n",
790
+ " print(f\" python pipeline_runner.py\")\n",
791
+ " print(f\"\\nThis will:\")\n",
792
+ " print(\" 1. Run Landing layers (event sources)\")\n",
793
+ " print(\" 2. Run Bronze layers (parallel)\")\n",
794
+ " print(\" 3. Run Silver merge\")\n",
795
+ " print(\" 4. Run Gold features\")\n",
796
+ " print(\" 5. Train models with MLflow\")"
797
+ ]
798
+ },
799
+ {
800
+ "cell_type": "markdown",
801
+ "id": "4598ce61",
802
+ "metadata": {
803
+ "papermill": {
804
+ "duration": 0.002325,
805
+ "end_time": "2026-02-02T13:04:17.363457",
806
+ "exception": false,
807
+ "start_time": "2026-02-02T13:04:17.361132",
808
+ "status": "completed"
809
+ },
810
+ "tags": []
811
+ },
812
+ "source": [
813
+ "---\n",
814
+ "\n",
815
+ "## 10.7 Summary"
816
+ ]
817
+ },
818
+ {
819
+ "cell_type": "code",
820
+ "execution_count": null,
821
+ "id": "985dd6db",
822
+ "metadata": {
823
+ "execution": {
824
+ "iopub.execute_input": "2026-02-02T13:04:17.370260Z",
825
+ "iopub.status.busy": "2026-02-02T13:04:17.370117Z",
826
+ "iopub.status.idle": "2026-02-02T13:04:17.375429Z",
827
+ "shell.execute_reply": "2026-02-02T13:04:17.374869Z"
828
+ },
829
+ "papermill": {
830
+ "duration": 0.009411,
831
+ "end_time": "2026-02-02T13:04:17.376048",
832
+ "exception": false,
833
+ "start_time": "2026-02-02T13:04:17.366637",
834
+ "status": "completed"
835
+ },
836
+ "tags": []
837
+ },
838
+ "outputs": [],
839
+ "source": [
840
+ "print(\"Generated Artifacts Summary\")\n",
841
+ "print(\"=\" * 60)\n",
842
+ "print(f\"Pipeline: {PIPELINE_NAME}\")\n",
843
+ "print(f\"Target: {GENERATION_TARGET.value}\")\n",
844
+ "print(f\"Format: {OUTPUT_FORMAT.value}\")\n",
845
+ "print(f\"Output: {output_dir}\")\n",
846
+ "print()\n",
847
+ "\n",
848
+ "def show_tree(path: Path, prefix: str = \"\"):\n",
849
+ " items = sorted(path.iterdir(), key=lambda p: (p.is_file(), p.name))\n",
850
+ " for i, item in enumerate(items):\n",
851
+ " is_last = i == len(items) - 1\n",
852
+ " connector = \"└── \" if is_last else \"β”œβ”€β”€ \"\n",
853
+ " if item.is_file():\n",
854
+ " size = item.stat().st_size\n",
855
+ " print(f\"{prefix}{connector}{item.name} ({size:,} bytes)\")\n",
856
+ " else:\n",
857
+ " print(f\"{prefix}{connector}{item.name}/\")\n",
858
+ " show_tree(item, prefix + (\" \" if is_last else \"β”‚ \"))\n",
859
+ "\n",
860
+ "if output_dir.exists():\n",
861
+ " show_tree(output_dir)"
862
+ ]
863
+ },
864
+ {
865
+ "cell_type": "markdown",
866
+ "id": "1dc48b8c",
867
+ "metadata": {
868
+ "papermill": {
869
+ "duration": 0.002716,
870
+ "end_time": "2026-02-02T13:04:17.382097",
871
+ "exception": false,
872
+ "start_time": "2026-02-02T13:04:17.379381",
873
+ "status": "completed"
874
+ },
875
+ "tags": []
876
+ },
877
+ "source": [
878
+ "---\n",
879
+ "\n",
880
+ "## 10.8 Recommendations Hash\n",
881
+ "\n",
882
+ "The recommendations hash is a unique identifier for the gold layer feature engineering configuration. It enables experiment tracking and reproducibility."
883
+ ]
884
+ },
885
+ {
886
+ "cell_type": "code",
887
+ "execution_count": null,
888
+ "id": "91b4ade3",
889
+ "metadata": {
890
+ "execution": {
891
+ "iopub.execute_input": "2026-02-02T13:04:17.388426Z",
892
+ "iopub.status.busy": "2026-02-02T13:04:17.388317Z",
893
+ "iopub.status.idle": "2026-02-02T13:04:17.393227Z",
894
+ "shell.execute_reply": "2026-02-02T13:04:17.392765Z"
895
+ },
896
+ "papermill": {
897
+ "duration": 0.008643,
898
+ "end_time": "2026-02-02T13:04:17.393664",
899
+ "exception": false,
900
+ "start_time": "2026-02-02T13:04:17.385021",
901
+ "status": "completed"
902
+ },
903
+ "tags": []
904
+ },
905
+ "outputs": [],
906
+ "source": [
907
+ "if registry:\n",
908
+ " recommendations_hash = registry.compute_recommendations_hash()\n",
909
+ " print(\"Recommendations Hash\")\n",
910
+ " print(\"=\" * 60)\n",
911
+ " print(f\"Hash: {recommendations_hash}\")\n",
912
+ " print(f\"Full version tag: v1.0.0_{recommendations_hash}\")\n",
913
+ " print()\n",
914
+ " print(\"This hash uniquely identifies the gold layer configuration:\")\n",
915
+ " print(f\" - Encodings: {len(registry.gold.encoding) if registry.gold else 0}\")\n",
916
+ " print(f\" - Scalings: {len(registry.gold.scaling) if registry.gold else 0}\")\n",
917
+ " print(f\" - Transformations: {len(registry.gold.transformations) if registry.gold else 0}\")\n",
918
+ " print(f\" - Feature selections: {len(registry.gold.feature_selection) if registry.gold else 0}\")\n",
919
+ " \n",
920
+ " # Show what's in each layer for debugging\n",
921
+ " print()\n",
922
+ " print(\"Recommendations by layer:\")\n",
923
+ " for layer in [\"bronze\", \"silver\", \"gold\"]:\n",
924
+ " recs = registry.get_by_layer(layer)\n",
925
+ " print(f\" {layer.upper()}: {len(recs)} recommendations\")\n",
926
+ " if recs and layer == \"gold\":\n",
927
+ " for rec in recs[:3]:\n",
928
+ " print(f\" - [{rec.category}] {rec.target_column}: {rec.action}\")\n",
929
+ " if len(recs) > 3:\n",
930
+ " print(f\" ... and {len(recs) - 3} more\")\n",
931
+ " \n",
932
+ " # Check if gold layer exists but is empty\n",
933
+ " if registry.gold:\n",
934
+ " print(f\"\\nβœ“ Gold layer initialized (target: {registry.gold.target_column})\")\n",
935
+ " else:\n",
936
+ " print(\"\\n⚠ Gold layer not initialized - run step 06 first\")\n",
937
+ " \n",
938
+ " print()\n",
939
+ " print(\"Use this hash to:\")\n",
940
+ " print(\" - Track MLflow experiments (tag: recommendations_hash)\")\n",
941
+ " print(\" - Version Feast feature views (tag in feature_store)\")\n",
942
+ " print(\" - Return to a specific feature engineering configuration\")\n",
943
+ "else:\n",
944
+ " print(\"No recommendations loaded - hash not available\")\n",
945
+ " print(\"Run notebooks 02-07 first, then re-run this notebook.\")"
946
+ ]
947
+ },
948
+ {
949
+ "cell_type": "markdown",
950
+ "id": "caef46fa",
951
+ "metadata": {
952
+ "papermill": {
953
+ "duration": 0.002355,
954
+ "end_time": "2026-02-02T13:04:17.399020",
955
+ "exception": false,
956
+ "start_time": "2026-02-02T13:04:17.396665",
957
+ "status": "completed"
958
+ },
959
+ "tags": []
960
+ },
961
+ "source": [
962
+ "---\n",
963
+ "\n",
964
+ "## 10.9 Feast Feature Store Validation\n",
965
+ "\n",
966
+ "Check what's registered in Feast after running the pipeline."
967
+ ]
968
+ },
969
+ {
970
+ "cell_type": "code",
971
+ "execution_count": null,
972
+ "id": "cd13afb0",
973
+ "metadata": {
974
+ "execution": {
975
+ "iopub.execute_input": "2026-02-02T13:04:17.405266Z",
976
+ "iopub.status.busy": "2026-02-02T13:04:17.405169Z",
977
+ "iopub.status.idle": "2026-02-02T13:04:18.933093Z",
978
+ "shell.execute_reply": "2026-02-02T13:04:18.932518Z"
979
+ },
980
+ "papermill": {
981
+ "duration": 1.531922,
982
+ "end_time": "2026-02-02T13:04:18.933926",
983
+ "exception": false,
984
+ "start_time": "2026-02-02T13:04:17.402004",
985
+ "status": "completed"
986
+ },
987
+ "tags": []
988
+ },
989
+ "outputs": [],
990
+ "source": [
991
+ "# Inspect Feast Feature Store contents\n",
992
+ "import warnings\n",
993
+ "warnings.filterwarnings(\"ignore\", category=DeprecationWarning, module=\"feast\")\n",
994
+ "\n",
995
+ "feast_repo_path = output_dir / \"feature_repo\"\n",
996
+ "\n",
997
+ "if feast_repo_path.exists() and (feast_repo_path / \"feature_store.yaml\").exists():\n",
998
+ " try:\n",
999
+ " from feast import FeatureStore\n",
1000
+ " store = FeatureStore(repo_path=str(feast_repo_path))\n",
1001
+ " \n",
1002
+ " print(\"Feast Feature Store Contents\")\n",
1003
+ " print(\"=\" * 60)\n",
1004
+ " \n",
1005
+ " # List entities\n",
1006
+ " entities = store.list_entities()\n",
1007
+ " feature_views = store.list_feature_views()\n",
1008
+ " data_sources = store.list_data_sources()\n",
1009
+ " \n",
1010
+ " # Check if registry is empty (feast apply not run yet)\n",
1011
+ " if not entities and not feature_views:\n",
1012
+ " print(\"\\n⚠️ Feature store registry is empty.\")\n",
1013
+ " print(\" The feature definitions exist but haven't been applied yet.\")\n",
1014
+ " print(\"\\n To register features, run:\")\n",
1015
+ " print(f\" cd {feast_repo_path}\")\n",
1016
+ " print(\" feast apply\")\n",
1017
+ " print(\"\\n Or run the full pipeline:\")\n",
1018
+ " print(f\" cd {output_dir}\")\n",
1019
+ " print(\" python run_all.py\")\n",
1020
+ " else:\n",
1021
+ " print(f\"\\nπŸ“¦ Entities ({len(entities)}):\")\n",
1022
+ " for entity in entities:\n",
1023
+ " print(f\" - {entity.name} (join_key: {entity.join_keys})\")\n",
1024
+ " \n",
1025
+ " print(f\"\\nπŸ“Š Feature Views ({len(feature_views)}):\")\n",
1026
+ " for fv in feature_views:\n",
1027
+ " print(f\" - {fv.name}: {len(fv.features)} features\")\n",
1028
+ " for feat in fv.features[:5]:\n",
1029
+ " print(f\" β€’ {feat.name} ({feat.dtype})\")\n",
1030
+ " if len(fv.features) > 5:\n",
1031
+ " print(f\" ... and {len(fv.features) - 5} more\")\n",
1032
+ " \n",
1033
+ " print(f\"\\nπŸ’Ύ Data Sources ({len(data_sources)}):\")\n",
1034
+ " for ds in data_sources:\n",
1035
+ " print(f\" - {ds.name}\")\n",
1036
+ " \n",
1037
+ " # Try to show sample data from parquet files\n",
1038
+ " print(f\"\\nπŸ“„ Sample Feature Data:\")\n",
1039
+ " data_dir = feast_repo_path / \"data\"\n",
1040
+ " if data_dir.exists():\n",
1041
+ " parquet_files = list(data_dir.glob(\"*.parquet\"))\n",
1042
+ " if parquet_files:\n",
1043
+ " sample_df = pd.read_parquet(parquet_files[0])\n",
1044
+ " print(f\" Source: {parquet_files[0].name}\")\n",
1045
+ " print(f\" Shape: {sample_df.shape[0]:,} rows x {sample_df.shape[1]} columns\")\n",
1046
+ " print(f\"\\n Head (first 5 rows):\")\n",
1047
+ " display(sample_df.head())\n",
1048
+ " else:\n",
1049
+ " print(\" No parquet files found yet in data/ directory.\")\n",
1050
+ " print(\" Features will be materialized when you run the pipeline.\")\n",
1051
+ " else:\n",
1052
+ " print(\" Data directory not created yet.\")\n",
1053
+ " \n",
1054
+ " except ImportError:\n",
1055
+ " print(\"Feast not installed. Install with: pip install feast\")\n",
1056
+ " except Exception as e:\n",
1057
+ " print(f\"Could not connect to Feast: {e}\")\n",
1058
+ " print(\"\\nTo manually inspect, run:\")\n",
1059
+ " print(f\" cd {feast_repo_path}\")\n",
1060
+ " print(\" feast apply\")\n",
1061
+ " print(\" feast feature-views list\")\n",
1062
+ "else:\n",
1063
+ " print(f\"Feature repo not found at: {feast_repo_path}\")\n",
1064
+ " print(\"Generate the pipeline first by running cells above.\")"
1065
+ ]
1066
+ },
1067
+ {
1068
+ "cell_type": "markdown",
1069
+ "id": "d0ccb535",
1070
+ "metadata": {
1071
+ "papermill": {
1072
+ "duration": 0.00305,
1073
+ "end_time": "2026-02-02T13:04:18.939946",
1074
+ "exception": false,
1075
+ "start_time": "2026-02-02T13:04:18.936896",
1076
+ "status": "completed"
1077
+ },
1078
+ "tags": []
1079
+ },
1080
+ "source": [
1081
+ "---\n",
1082
+ "\n",
1083
+ "## 10.10 Next Steps\n",
1084
+ "\n",
1085
+ "### Run Pipeline (Single Command)\n",
1086
+ "```bash\n",
1087
+ "cd ../generated_pipelines/local/customer_churn\n",
1088
+ "python run_all.py\n",
1089
+ "```\n",
1090
+ "\n",
1091
+ "This single command:\n",
1092
+ "1. Runs Bronze layers in **parallel**\n",
1093
+ "2. Runs Silver merge\n",
1094
+ "3. Runs Gold features \n",
1095
+ "4. Trains models with MLflow tracking\n",
1096
+ "5. **Auto-starts MLflow UI** and opens browser\n",
1097
+ "6. Press `Ctrl+C` to stop when done\n",
1098
+ "\n",
1099
+ "### Generated Structure\n",
1100
+ "```\n",
1101
+ "generated_pipelines/local/{pipeline}/\n",
1102
+ "β”œβ”€β”€ run_all.py # Single entry point\n",
1103
+ "β”œβ”€β”€ config.py # Configuration (includes RECOMMENDATIONS_HASH)\n",
1104
+ "β”œβ”€β”€ bronze/\n",
1105
+ "β”‚ └── bronze_*.py # Parallel execution\n",
1106
+ "β”œβ”€β”€ silver/\n",
1107
+ "β”‚ └── silver_merge.py\n",
1108
+ "β”œβ”€β”€ gold/\n",
1109
+ "β”‚ └── gold_features.py # Includes feature version tag\n",
1110
+ "β”œβ”€β”€ training/\n",
1111
+ "β”‚ └── ml_experiment.py # MLflow tags with recommendations_hash\n",
1112
+ "β”œβ”€β”€ pipeline.py # Standalone pipeline script\n",
1113
+ "└── requirements.txt\n",
1114
+ "```\n",
1115
+ "\n",
1116
+ "### Tracking Your Experiment\n",
1117
+ "After running, you can find your experiment by:\n",
1118
+ "- **MLflow UI**: Filter by tag `recommendations_hash = <your_hash>`\n",
1119
+ "- **Feast**: Check feature view tags for `recommendations_hash`\n",
1120
+ "- **Return to config**: The hash uniquely identifies the gold layer settings\n",
1121
+ "\n",
1122
+ "---\n",
1123
+ "\n",
1124
+ "## Complete!"
1125
+ ]
1126
+ },
1127
+ {
1128
+ "cell_type": "markdown",
1129
+ "id": "53e908ce",
1130
+ "metadata": {
1131
+ "papermill": {
1132
+ "duration": 0.002918,
1133
+ "end_time": "2026-02-02T13:04:18.946314",
1134
+ "exception": false,
1135
+ "start_time": "2026-02-02T13:04:18.943396",
1136
+ "status": "completed"
1137
+ },
1138
+ "tags": []
1139
+ },
1140
+ "source": [
1141
+ "> **Save Reminder:** Save this notebook (Ctrl+S / Cmd+S) before running the next one.\n",
1142
+ "> The next notebook will automatically export this notebook's HTML documentation from the saved file."
1143
+ ]
1144
+ }
1145
+ ],
1146
+ "metadata": {
1147
+ "kernelspec": {
1148
+ "display_name": "Python 3",
1149
+ "language": "python",
1150
+ "name": "python3"
1151
+ },
1152
+ "language_info": {
1153
+ "codemirror_mode": {
1154
+ "name": "ipython",
1155
+ "version": 3
1156
+ },
1157
+ "file_extension": ".py",
1158
+ "mimetype": "text/x-python",
1159
+ "name": "python",
1160
+ "nbconvert_exporter": "python",
1161
+ "pygments_lexer": "ipython3",
1162
+ "version": "3.12.4"
1163
+ },
1164
+ "papermill": {
1165
+ "default_parameters": {},
1166
+ "duration": 19.869551,
1167
+ "end_time": "2026-02-02T13:04:19.468369",
1168
+ "environment_variables": {},
1169
+ "exception": null,
1170
+ "input_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/10_spec_generation.ipynb",
1171
+ "output_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/10_spec_generation.ipynb",
1172
+ "parameters": {},
1173
+ "start_time": "2026-02-02T13:03:59.598818",
1174
+ "version": "2.6.0"
1175
+ }
1176
+ },
1177
+ "nbformat": 4,
1178
+ "nbformat_minor": 5
1179
+ }