churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
customer_retention/generators/pipeline_generator/renderer.py
@@ -0,0 +1,2125 @@
1
+ from collections import OrderedDict, namedtuple
2
+ from pathlib import Path
3
+ from typing import List, Tuple
4
+
5
+ from jinja2 import BaseLoader, Environment
6
+
7
+ from .models import (
8
+ BronzeEventConfig,
9
+ BronzeLayerConfig,
10
+ LandingLayerConfig,
11
+ PipelineConfig,
12
+ PipelineTransformationType,
13
+ TransformationStep,
14
+ )
15
+
16
+ SECTION_MAP = {
17
+ PipelineTransformationType.IMPUTE_NULL: "Missing Value Analysis",
18
+ PipelineTransformationType.DROP_COLUMN: "Missing Value Analysis",
19
+ PipelineTransformationType.CAP_OUTLIER: "Global Outlier Detection",
20
+ PipelineTransformationType.WINSORIZE: "Global Outlier Detection",
21
+ PipelineTransformationType.SEGMENT_AWARE_CAP: "Segment-Aware Outlier Analysis",
22
+ PipelineTransformationType.LOG_TRANSFORM: "Feature Distributions",
23
+ PipelineTransformationType.SQRT_TRANSFORM: "Feature Distributions",
24
+ PipelineTransformationType.YEO_JOHNSON: "Feature Distributions",
25
+ PipelineTransformationType.CAP_THEN_LOG: "Feature Distributions",
26
+ PipelineTransformationType.ZERO_INFLATION_HANDLING: "Feature Distributions",
27
+ PipelineTransformationType.ENCODE: "Categorical Feature Analysis",
28
+ PipelineTransformationType.SCALE: "Feature-Target Correlations",
29
+ PipelineTransformationType.FEATURE_SELECT: "Feature Selection Recommendations",
30
+ PipelineTransformationType.DERIVED_COLUMN: "Feature Engineering Recommendations",
31
+ PipelineTransformationType.TYPE_CAST: "Data Consistency Checks",
32
+ }
33
+
34
+ ANCHOR_MAP = {
35
+ PipelineTransformationType.IMPUTE_NULL: "3.5-Missing-Value-Analysis",
36
+ PipelineTransformationType.DROP_COLUMN: "3.5-Missing-Value-Analysis",
37
+ PipelineTransformationType.CAP_OUTLIER: "3.8-Global-Outlier-Detection",
38
+ PipelineTransformationType.WINSORIZE: "3.8-Global-Outlier-Detection",
39
+ PipelineTransformationType.SEGMENT_AWARE_CAP: "3.7-Segment-Aware-Outlier-Analysis",
40
+ PipelineTransformationType.LOG_TRANSFORM: "4.4-Feature-Distributions-by-Retention-Status",
41
+ PipelineTransformationType.SQRT_TRANSFORM: "4.4-Feature-Distributions-by-Retention-Status",
42
+ PipelineTransformationType.YEO_JOHNSON: "4.4-Feature-Distributions-by-Retention-Status",
43
+ PipelineTransformationType.CAP_THEN_LOG: "4.4-Feature-Distributions-by-Retention-Status",
44
+ PipelineTransformationType.ZERO_INFLATION_HANDLING: "4.4-Feature-Distributions-by-Retention-Status",
45
+ PipelineTransformationType.ENCODE: "4.6-Categorical-Feature-Analysis",
46
+ PipelineTransformationType.SCALE: "4.5-Feature-Target-Correlations",
47
+ PipelineTransformationType.FEATURE_SELECT: "4.9.1-Feature-Selection-Recommendations",
48
+ PipelineTransformationType.DERIVED_COLUMN: "4.9.4-Feature-Engineering-Recommendations",
49
+ PipelineTransformationType.TYPE_CAST: "3.11-Data-Consistency-Checks",
50
+ }
51
+
52
+ DEFAULT_NOTEBOOK_MAP = {
53
+ PipelineTransformationType.IMPUTE_NULL: "03_quality_assessment",
54
+ PipelineTransformationType.DROP_COLUMN: "03_quality_assessment",
55
+ PipelineTransformationType.CAP_OUTLIER: "03_quality_assessment",
56
+ PipelineTransformationType.WINSORIZE: "03_quality_assessment",
57
+ PipelineTransformationType.SEGMENT_AWARE_CAP: "03_quality_assessment",
58
+ PipelineTransformationType.TYPE_CAST: "03_quality_assessment",
59
+ PipelineTransformationType.LOG_TRANSFORM: "04_relationship_analysis",
60
+ PipelineTransformationType.SQRT_TRANSFORM: "04_relationship_analysis",
61
+ PipelineTransformationType.YEO_JOHNSON: "04_relationship_analysis",
62
+ PipelineTransformationType.CAP_THEN_LOG: "04_relationship_analysis",
63
+ PipelineTransformationType.ZERO_INFLATION_HANDLING: "04_relationship_analysis",
64
+ PipelineTransformationType.ENCODE: "04_relationship_analysis",
65
+ PipelineTransformationType.SCALE: "04_relationship_analysis",
66
+ PipelineTransformationType.FEATURE_SELECT: "04_relationship_analysis",
67
+ PipelineTransformationType.DERIVED_COLUMN: "04_relationship_analysis",
68
+ }
69
+
70
+
71
+ _docs_base: str = "docs"
72
+
73
+
74
+ def _notebook_title(notebook: str) -> str:
75
+ name = notebook.split("_", 1)[1] if "_" in notebook else notebook
76
+ return name.replace("_", " ").title()
77
+
78
+
79
+ def provenance_docstring(step: TransformationStep) -> str:
80
+ notebook = step.source_notebook or DEFAULT_NOTEBOOK_MAP.get(step.type)
81
+ if not notebook:
82
+ return ""
83
+ title = _notebook_title(notebook)
84
+ anchor = ANCHOR_MAP.get(step.type)
85
+ section = SECTION_MAP.get(step.type)
86
+ base = _docs_base
87
+ if anchor:
88
+ return f"{title} {section}\n {base}/{notebook}.html#{anchor}"
89
+ return f"{title}\n {base}/{notebook}.html"
90
+
91
+
92
+ def provenance_docstring_block(steps) -> str:
93
+ seen = set()
94
+ entries = []
95
+ for step in steps:
96
+ key = provenance_key(step)
97
+ if not key or key in seen:
98
+ continue
99
+ seen.add(key)
100
+ entry = provenance_docstring(step)
101
+ if entry:
102
+ entries.append(entry)
103
+ if not entries:
104
+ return ""
105
+ body = "\n ".join(entries)
106
+ return f' """\n {body}\n """'
107
+
108
+
109
+ def provenance_key(step: TransformationStep) -> str:
110
+ notebook = step.source_notebook or DEFAULT_NOTEBOOK_MAP.get(step.type)
111
+ section = SECTION_MAP.get(step.type, "")
112
+ return f"{notebook}:{section}" if notebook else ""
113
+
114
+
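For orientation, here is a minimal illustrative sketch (not part of the package) of how the provenance helpers above could be exercised on their own. The TransformationStep keyword arguments are assumed from how the templates later in this file construct steps; the column name and parameters are hypothetical.

    # Illustrative only. Assumes TransformationStep(type=..., column=..., parameters=..., rationale=...)
    # as used by the templates further down, and that source_notebook may be left unset.
    from customer_retention.generators.pipeline_generator.models import (
        PipelineTransformationType,
        TransformationStep,
    )
    from customer_retention.generators.pipeline_generator.renderer import (
        provenance_docstring,
        provenance_docstring_block,
    )

    step = TransformationStep(
        type=PipelineTransformationType.IMPUTE_NULL,
        column="tenure_months",             # hypothetical column
        parameters={"strategy": "median"},  # hypothetical parameters
        rationale="High null rate flagged in quality assessment",
    )

    # Resolves to "Quality Assessment Missing Value Analysis" plus a link to
    # docs/03_quality_assessment.html#3.5-Missing-Value-Analysis
    print(provenance_docstring(step))

    # Duplicate notebook/section pairs collapse to a single docstring entry
    print(provenance_docstring_block([step, step]))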
115
+ class StepGrouper:
116
+
117
+ _TYPE_TO_FUNC = {
118
+ PipelineTransformationType.DROP_COLUMN: "drop_unusable_columns",
119
+ PipelineTransformationType.IMPUTE_NULL: "impute_remaining_nulls",
120
+ PipelineTransformationType.CAP_OUTLIER: "cap_outliers",
121
+ PipelineTransformationType.TYPE_CAST: "apply_type_casts",
122
+ PipelineTransformationType.WINSORIZE: "winsorize_outliers",
123
+ PipelineTransformationType.SEGMENT_AWARE_CAP: "cap_segment_aware_outliers",
124
+ PipelineTransformationType.LOG_TRANSFORM: "apply_log_transforms",
125
+ PipelineTransformationType.SQRT_TRANSFORM: "apply_sqrt_transforms",
126
+ PipelineTransformationType.ZERO_INFLATION_HANDLING: "handle_zero_inflation",
127
+ PipelineTransformationType.CAP_THEN_LOG: "apply_cap_then_log_transforms",
128
+ PipelineTransformationType.YEO_JOHNSON: "apply_power_transforms",
129
+ PipelineTransformationType.FEATURE_SELECT: "apply_feature_selection",
130
+ }
131
+
132
+ _DERIVED_ACTION_TO_FUNC = {
133
+ "ratio": "create_ratio_features",
134
+ "interaction": "create_interaction_features",
135
+ "composite": "create_composite_features",
136
+ }
137
+
138
+ @classmethod
139
+ def group(cls, steps: List[TransformationStep]) -> List[Tuple[str, List[TransformationStep]]]:
140
+ if not steps:
141
+ return []
142
+ groups: OrderedDict[str, List[TransformationStep]] = OrderedDict()
143
+ for step in steps:
144
+ groups.setdefault(cls._func_name(step), []).append(step)
145
+ return list(groups.items())
146
+
147
+ @classmethod
148
+ def _func_name(cls, step: TransformationStep) -> str:
149
+ if step.type == PipelineTransformationType.DERIVED_COLUMN:
150
+ action = step.parameters.get("action", "ratio")
151
+ return cls._DERIVED_ACTION_TO_FUNC.get(action, f"create_{action}_features")
152
+ return cls._TYPE_TO_FUNC.get(step.type, f"apply_{step.type.value}")
153
+
154
+
155
+ group_steps = StepGrouper.group
156
+
157
+
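Continuing the illustrative sketch above (same assumptions about the TransformationStep constructor), group_steps buckets steps by the pipeline function that will be generated for them, preserving first-seen order:

    # Illustrative only; step contents are hypothetical.
    from customer_retention.generators.pipeline_generator.models import (
        PipelineTransformationType,
        TransformationStep,
    )
    from customer_retention.generators.pipeline_generator.renderer import group_steps

    drop = TransformationStep(
        type=PipelineTransformationType.DROP_COLUMN,
        column="free_text_notes", parameters={}, rationale="Unusable free text",
    )
    ratio = TransformationStep(
        type=PipelineTransformationType.DERIVED_COLUMN,
        column="spend_per_visit", parameters={"action": "ratio"}, rationale="Ratio feature",
    )

    for func_name, steps in group_steps([drop, drop, ratio]):
        print(func_name, len(steps))
    # drop_unusable_columns 2
    # create_ratio_features 1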
158
+ class InlineLoader(BaseLoader):
159
+ def __init__(self, templates: dict):
160
+ self._templates = templates
161
+
162
+ def get_source(self, environment, template):
163
+ if template in self._templates:
164
+ return self._templates[template], template, lambda: True
165
+ raise Exception(f"Template {template} not found")
166
+
167
+
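Before the TEMPLATES dict that follows, a rough sketch of how an Environment built on InlineLoader might be wired. This is an assumption for illustration only: the package's actual wiring lives further down in renderer.py and is not shown in this hunk, and the trim/lstrip flags are a guess. collect_imports, render_step_call, and action_description are referenced by the bronze/silver/gold templates and are assumed to be defined elsewhere in the file.

    # Illustrative sketch only; the real renderer configures its own Environment.
    from jinja2 import Environment

    from customer_retention.generators.pipeline_generator.renderer import (
        TEMPLATES,
        InlineLoader,
        group_steps,
        provenance_docstring_block,
    )


    def render_config_module(config) -> str:
        """Render config.py for a PipelineConfig (see models.py)."""
        env = Environment(loader=InlineLoader(TEMPLATES), trim_blocks=True, lstrip_blocks=True)
        env.globals.update(
            group_steps=group_steps,
            provenance_docstring_block=provenance_docstring_block,
            # collect_imports, render_step_call and action_description are also referenced
            # by the other templates; presumably the package registers them the same way.
        )
        return env.get_template("config.py.j2").render(config=config)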
168
+ TEMPLATES = {
169
+ "config.py.j2": """import os
170
+ from pathlib import Path
171
+
172
+ PIPELINE_NAME = "{{ config.name }}"
173
+ TARGET_COLUMN = "{{ config.target_column }}"
174
+ OUTPUT_DIR = Path("{{ config.output_dir }}")
175
+
176
+ # Iteration tracking
177
+ ITERATION_ID = {{ '"%s"' % config.iteration_id if config.iteration_id else 'None' }}
178
+ PARENT_ITERATION_ID = {{ '"%s"' % config.parent_iteration_id if config.parent_iteration_id else 'None' }}
179
+
180
+ # Recommendations hash for experiment tracking
181
+ RECOMMENDATIONS_HASH = {{ '"%s"' % config.recommendations_hash if config.recommendations_hash else 'None' }}
182
+
183
+
184
+ def _find_project_root():
185
+ path = Path(__file__).parent
186
+ for _ in range(10):
187
+ if (path / "pyproject.toml").exists() or (path / ".git").exists():
188
+ return path
189
+ path = path.parent
190
+ return Path(__file__).parent
191
+
192
+
193
+ PROJECT_ROOT = _find_project_root()
194
+
195
+ # Experiments directory - all artifacts (data, mlruns, feast) go here
196
+ # Override with CR_EXPERIMENTS_DIR environment variable for Databricks/custom locations
197
+ _default_experiments = {{ '"%s"' % config.experiments_dir if config.experiments_dir else '"experiments"' }}
198
+ EXPERIMENTS_DIR = Path(os.environ.get("CR_EXPERIMENTS_DIR", str(PROJECT_ROOT / _default_experiments)))
199
+
200
+ # Documentation base URL for provenance links in generated code
201
+ # Local: file:// URI to HTML docs (from export_tutorial_html.py)
202
+ # Databricks: set to workspace notebook path for exploration report
203
+ DOCS_BASE_URL = os.environ.get("CR_DOCS_BASE_URL", str(EXPERIMENTS_DIR / "docs"))
204
+
205
+ # Production output directory - all pipeline writes go here
206
+ # Override with CR_PRODUCTION_DIR environment variable
207
+ _default_production = {{ '"%s"' % config.production_dir if config.production_dir else 'str(EXPERIMENTS_DIR)' }}
208
+ PRODUCTION_DIR = Path(os.environ.get("CR_PRODUCTION_DIR", _default_production))
209
+
210
+ # MLflow tracking - using SQLite backend (recommended over deprecated file-based backend)
211
+ MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", f"sqlite:///{EXPERIMENTS_DIR / 'mlruns.db'}")
212
+ MLFLOW_ARTIFACT_ROOT = str(EXPERIMENTS_DIR / "mlruns" / "artifacts")
213
+
214
+ # Feast feature store configuration - stored in experiments directory
215
+ FEAST_REPO_PATH = str(PRODUCTION_DIR / "feature_repo")
216
+ FEAST_FEATURE_VIEW = "{{ config.feast.feature_view_name if config.feast else config.name + '_features' }}"
217
+ FEAST_ENTITY_NAME = "{{ config.feast.entity_name if config.feast else 'customer' }}"
218
+ FEAST_ENTITY_KEY = "{{ config.feast.entity_key if config.feast else config.sources[0].entity_key }}"
219
+ FEAST_TIMESTAMP_COL = "{{ config.feast.timestamp_column if config.feast else 'event_timestamp' }}"
220
+ FEAST_TTL_DAYS = {{ config.feast.ttl_days if config.feast else 365 }}
221
+
222
+ # Source paths - findings directory is a subfolder of experiments
223
+ FINDINGS_DIR = EXPERIMENTS_DIR / "findings"
224
+
225
+ SOURCES = {
226
+ {% for source in config.sources %}
227
+ "{{ source.name }}": {
228
+ "path": str(FINDINGS_DIR / "{{ source.path }}"),
229
+ "format": "{{ source.format }}",
230
+ "entity_key": "{{ source.entity_key }}",
231
+ {% if source.time_column %}
232
+ "time_column": "{{ source.time_column }}",
233
+ {% endif %}
234
+ "is_event_level": {{ source.is_event_level }},
235
+ },
236
+ {% endfor %}
237
+ }
238
+
239
+
240
+ def get_bronze_path(source_name: str) -> Path:
241
+ return PRODUCTION_DIR / "data" / "bronze" / f"{source_name}.parquet"
242
+
243
+
244
+ def get_silver_path() -> Path:
245
+ return PRODUCTION_DIR / "data" / "silver" / "merged.parquet"
246
+
247
+
248
+ def get_gold_path() -> Path:
249
+ return PRODUCTION_DIR / "data" / "gold" / "features.parquet"
250
+
251
+
252
+ def get_feast_data_path() -> Path:
253
+ return Path(FEAST_REPO_PATH) / "data" / f"{FEAST_FEATURE_VIEW}.parquet"
254
+
255
+
256
+ # Fit mode configuration for training vs scoring separation
257
+ FIT_MODE = {{ 'True' if config.fit_mode else 'False' }}
258
+ ARTIFACTS_PATH = {{ '"%s"' % config.artifacts_path if config.artifacts_path else 'str(PRODUCTION_DIR / "artifacts" / (RECOMMENDATIONS_HASH or "default"))' }}
259
+
260
+ RAW_SOURCES = {
261
+ {% for name, landing in config.landing.items() %}
262
+ "{{ name }}": {
263
+ "path": "{{ landing.raw_source_path }}",
264
+ "format": "{{ landing.raw_source_format }}",
265
+ "entity_key": "{{ landing.entity_column }}",
266
+ "time_column": "{{ landing.time_column }}",
267
+ },
268
+ {% endfor %}
269
+ }
270
+
271
+ EXCLUDED_SOURCES = [
272
+ {% for source in config.sources %}
273
+ {% if source.excluded %}
274
+ "{{ source.name }}",
275
+ {% endif %}
276
+ {% endfor %}
277
+ ]
278
+
279
+ EXPLORATION_ARTIFACTS = {
280
+ "bronze": {name: str(EXPERIMENTS_DIR / "data" / "bronze" / f"{name}.parquet") for name in SOURCES},
281
+ "silver": str(EXPERIMENTS_DIR / "data" / "silver" / "merged.parquet"),
282
+ "gold": str(EXPERIMENTS_DIR / "data" / "gold" / "features.parquet"),
283
+ "scoring": str(EXPERIMENTS_DIR / "data" / "scoring" / "predictions.parquet"),
284
+ }
285
+ """,
286
+ "bronze.py.j2": """import pandas as pd
287
+ import numpy as np
288
+ from pathlib import Path
289
+ {% set ops, fitted = collect_imports(config.transformations, False) %}
290
+ {% if ops %}
291
+ from customer_retention.transforms import {{ ops | sort | join(', ') }}
292
+ {% endif %}
293
+ from config import SOURCES, get_bronze_path{{ ', RAW_SOURCES' if config.lifecycle else '' }}
294
+
295
+ SOURCE_NAME = "{{ source }}"
296
+
297
+
298
+ def load_{{ source }}():
299
+ source_config = SOURCES[SOURCE_NAME]
300
+ path = Path(source_config["path"])
301
+ if path.is_dir() and (path / "_delta_log").is_dir():
302
+ from customer_retention.integrations.adapters.factory import get_delta
303
+ return get_delta(force_local=True).read(str(path))
304
+ if not path.exists():
305
+ raise FileNotFoundError(f"Source file not found: {path}")
306
+ if source_config["format"] == "csv":
307
+ return pd.read_csv(path)
308
+ return pd.read_parquet(path)
309
+
310
+
311
+ {% set groups = group_steps(config.transformations) %}
312
+
313
+ def apply_transformations(df: pd.DataFrame) -> pd.DataFrame:
314
+ {%- if groups %}
315
+ {%- for func_name, steps in groups %}
316
+ df = {{ func_name }}(df)
317
+ {%- endfor %}
318
+ {%- endif %}
319
+ return df
320
+
321
+ {% for func_name, steps in groups %}
322
+
323
+ def {{ func_name }}(df: pd.DataFrame) -> pd.DataFrame:
324
+ {%- set _prov = provenance_docstring_block(steps) %}
325
+ {%- if _prov %}
326
+ {{ _prov }}
327
+ {%- endif %}
328
+ {%- for t in steps %}
329
+ # {{ t.rationale }}
330
+ # {{ action_description(t) }}
331
+ df = {{ render_step_call(t) }}
332
+ {%- endfor %}
333
+ return df
334
+ {% endfor %}
335
+
336
+ {% if config.lifecycle %}
337
+
338
+ # --- Lifecycle enrichment (computed on cleaned data) ---
339
+
340
+ ENTITY_COLUMN = "{{ config.entity_column or config.source.entity_key }}"
341
+ TIME_COLUMN = "{{ config.time_column or config.source.time_column }}"
342
+
343
+
344
+ def _load_raw_events():
345
+ source = RAW_SOURCES[SOURCE_NAME]
346
+ path = Path(source["path"])
347
+ if path.is_dir() and (path / "_delta_log").is_dir():
348
+ from customer_retention.integrations.adapters.factory import get_delta
349
+ return get_delta(force_local=True).read(str(path))
350
+ if not path.exists():
351
+ raise FileNotFoundError(f"Raw source not found: {path}")
352
+ if source["format"] == "csv":
353
+ return pd.read_csv(path)
354
+ return pd.read_parquet(path)
355
+
356
+ {% if config.lifecycle.include_recency_bucket %}
357
+
358
+ def add_recency_tenure(df: pd.DataFrame, raw_df: pd.DataFrame) -> pd.DataFrame:
359
+ raw_df[TIME_COLUMN] = pd.to_datetime(raw_df[TIME_COLUMN])
360
+ reference_date = raw_df[TIME_COLUMN].max()
361
+ entity_stats = raw_df.groupby(ENTITY_COLUMN)[TIME_COLUMN].agg(["min", "max"])
362
+ entity_stats["days_since_last"] = (reference_date - entity_stats["max"]).dt.days
363
+ entity_stats["days_since_first"] = (reference_date - entity_stats["min"]).dt.days
364
+ df = df.merge(entity_stats[["days_since_last", "days_since_first"]], left_on=ENTITY_COLUMN, right_index=True, how="left")
365
+ return df
366
+
367
+
368
+ def add_recency_buckets(df: pd.DataFrame) -> pd.DataFrame:
369
+ if "days_since_last" in df.columns:
370
+ df["recency_bucket"] = pd.cut(df["days_since_last"], bins=[0, 7, 30, 90, 180, 365, float("inf")],
371
+ labels=["0-7d", "7-30d", "30-90d", "90-180d", "180-365d", "365d+"])
372
+ return df
373
+
374
+ {% endif %}
375
+ {% if config.lifecycle.include_lifecycle_quadrant %}
376
+
377
+ def add_lifecycle_quadrant(df: pd.DataFrame) -> pd.DataFrame:
378
+ if "days_since_first" not in df.columns:
379
+ return df
380
+ tenure = df["days_since_first"]
381
+ intensity_col = [c for c in df.columns if c.startswith("event_count_")]
382
+ if not intensity_col:
383
+ return df
384
+ intensity = df[intensity_col[0]]
385
+ tenure_med = tenure.median()
386
+ intensity_med = intensity.median()
387
+ conditions = [
388
+ (tenure >= tenure_med) & (intensity >= intensity_med),
389
+ (tenure >= tenure_med) & (intensity < intensity_med),
390
+ (tenure < tenure_med) & (intensity >= intensity_med),
391
+ (tenure < tenure_med) & (intensity < intensity_med),
392
+ ]
393
+ labels = ["loyal", "at_risk", "new_active", "new_inactive"]
394
+ df["lifecycle_quadrant"] = np.select(conditions, labels, default="unknown")
395
+ return df
396
+
397
+ {% endif %}
398
+ {% if config.lifecycle.include_cyclical_features %}
399
+
400
+ def add_cyclical_features(df: pd.DataFrame, raw_df: pd.DataFrame) -> pd.DataFrame:
401
+ raw_df[TIME_COLUMN] = pd.to_datetime(raw_df[TIME_COLUMN])
402
+ mean_dow = raw_df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.dayofweek.mean())
403
+ df = df.merge(mean_dow.rename("mean_dow"), left_on=ENTITY_COLUMN, right_index=True, how="left")
404
+ df["dow_sin"] = np.sin(2 * np.pi * df["mean_dow"] / 7)
405
+ df["dow_cos"] = np.cos(2 * np.pi * df["mean_dow"] / 7)
406
+ df = df.drop(columns=["mean_dow"], errors="ignore")
407
+ return df
408
+
409
+ {% endif %}
410
+ {% if config.lifecycle.momentum_pairs %}
411
+
412
+ def add_momentum_ratios(df: pd.DataFrame) -> pd.DataFrame:
413
+ {% for pair in config.lifecycle.momentum_pairs %}
414
+ short_col = "event_count_{{ pair.short_window }}"
415
+ long_col = "event_count_{{ pair.long_window }}"
416
+ if short_col in df.columns and long_col in df.columns:
417
+ df["momentum_{{ pair.short_window }}_{{ pair.long_window }}"] = df[short_col] / df[long_col].replace(0, float("nan"))
418
+ {% endfor %}
419
+ return df
420
+
421
+ {% endif %}
422
+
423
+ def enrich_lifecycle(df: pd.DataFrame) -> pd.DataFrame:
424
+ raw_df = _load_raw_events()
425
+ {% if config.raw_time_column %}
426
+ raw_df = raw_df.rename(columns={"{{ config.raw_time_column }}": TIME_COLUMN})
427
+ {% endif %}
428
+ {% if config.lifecycle.include_recency_bucket %}
429
+ df = add_recency_tenure(df, raw_df)
430
+ df = add_recency_buckets(df)
431
+ {% endif %}
432
+ {% if config.lifecycle.include_lifecycle_quadrant %}
433
+ df = add_lifecycle_quadrant(df)
434
+ {% endif %}
435
+ {% if config.lifecycle.include_cyclical_features %}
436
+ df = add_cyclical_features(df, raw_df)
437
+ {% endif %}
438
+ {% if config.lifecycle.momentum_pairs %}
439
+ df = add_momentum_ratios(df)
440
+ {% endif %}
441
+ return df
442
+ {% endif %}
443
+
444
+
445
+ def run_bronze_{{ source }}():
446
+ df = load_{{ source }}()
447
+ df = apply_transformations(df)
448
+ {% if config.lifecycle %}
449
+ df = enrich_lifecycle(df)
450
+ {% endif %}
451
+ output_path = get_bronze_path(SOURCE_NAME)
452
+ output_dir = output_path.parent
453
+ output_dir.mkdir(parents=True, exist_ok=True)
454
+ try:
455
+ from customer_retention.integrations.adapters.factory import get_delta
456
+ storage = get_delta(force_local=True)
457
+ storage.write(df, str(output_dir / SOURCE_NAME))
458
+ except ImportError:
459
+ df.to_parquet(output_path, index=False)
460
+ return df
461
+
462
+
463
+ if __name__ == "__main__":
464
+ run_bronze_{{ source }}()
465
+ """,
466
+ "silver.py.j2": '''import pandas as pd
467
+ {% set ops, fitted = collect_imports(config.silver.derived_columns, False) %}
468
+ {% if ops %}
469
+ from customer_retention.transforms import {{ ops | sort | join(', ') }}
470
+ {% endif %}
471
+ from config import SOURCES, get_bronze_path, get_silver_path, TARGET_COLUMN
472
+
473
+
474
+ def _load_artifact(path):
475
+ from pathlib import Path as _P
476
+ p = _P(path)
477
+ if p.parent.is_dir() and (p.parent / p.stem / "_delta_log").is_dir():
478
+ from customer_retention.integrations.adapters.factory import get_delta
479
+ return get_delta(force_local=True).read(str(p.parent / p.stem))
480
+ return pd.read_parquet(path)
481
+
482
+
483
+ def load_bronze_outputs() -> dict:
484
+ return {name: _load_artifact(get_bronze_path(name))
485
+ for name in SOURCES.keys() if not SOURCES[name].get("excluded")}
486
+
487
+
488
+ def merge_sources(bronze_outputs: dict) -> pd.DataFrame:
489
+ base_source = "{{ config.sources[0].name }}"
490
+ merged = bronze_outputs[base_source]
491
+ {% for join in config.silver.joins %}
492
+ merged = merged.merge(
493
+ bronze_outputs["{{ join.right_source }}"],
494
+ left_on="{{ join.left_key }}",
495
+ right_on="{{ join.right_key }}",
496
+ how="{{ join.how }}"
497
+ )
498
+ {% endfor %}
499
+ return merged
500
+
501
+
502
+ def create_holdout_mask(df: pd.DataFrame, holdout_fraction: float = 0.1, random_state: int = 42) -> pd.DataFrame:
503
+ """Create holdout set by masking target for a fraction of records.
504
+
505
+ IMPORTANT: This must happen in the silver layer (BEFORE gold layer feature computation)
506
+ to prevent temporal leakage. If holdout is created after features are computed,
507
+ the features may contain information derived from the target values that will be masked.
508
+
509
+ Args:
510
+ df: DataFrame with TARGET_COLUMN
511
+ holdout_fraction: Fraction of records to use for holdout (default 10%)
512
+ random_state: Random seed for reproducibility
513
+
514
+ Returns:
515
+ DataFrame with holdout mask applied (original values stored in original_{TARGET_COLUMN})
516
+ """
517
+ ORIGINAL_COLUMN = f"original_{TARGET_COLUMN}"
518
+
519
+ # Skip if holdout already exists
520
+ if ORIGINAL_COLUMN in df.columns:
521
+ print(f" Holdout already exists ({ORIGINAL_COLUMN}), skipping creation")
522
+ return df
523
+
524
+ if TARGET_COLUMN not in df.columns:
525
+ print(f" Warning: TARGET_COLUMN \\'{TARGET_COLUMN}\\' not found, skipping holdout creation")
526
+ return df
527
+
528
+ print(f"Creating holdout set ({holdout_fraction:.0%} of data)...")
529
+ df = df.copy()
530
+
531
+ n_holdout = int(len(df) * holdout_fraction)
532
+ holdout_idx = df.sample(n=n_holdout, random_state=random_state).index
533
+
534
+ # Store original values for holdout records only
535
+ df[ORIGINAL_COLUMN] = pd.NA
536
+ df.loc[holdout_idx, ORIGINAL_COLUMN] = df.loc[holdout_idx, TARGET_COLUMN]
537
+
538
+ # Mask target values for holdout records
539
+ df.loc[holdout_idx, TARGET_COLUMN] = pd.NA
540
+
541
+ print(f" Holdout records: {n_holdout:,} ({holdout_fraction:.0%})")
542
+ print(f" Training records: {len(df) - n_holdout:,} ({1-holdout_fraction:.0%})")
543
+
544
+ return df
545
+
546
+
547
+ {% set derived_groups = group_steps(config.silver.derived_columns) %}
548
+
549
+ def create_derived_columns(df: pd.DataFrame) -> pd.DataFrame:
550
+ {%- if derived_groups %}
551
+ {%- for func_name, steps in derived_groups %}
552
+ df = {{ func_name }}(df)
553
+ {%- endfor %}
554
+ {%- endif %}
555
+ return df
556
+
557
+ {% for func_name, steps in derived_groups %}
558
+
559
+ def {{ func_name }}(df: pd.DataFrame) -> pd.DataFrame:
560
+ {%- set _prov = provenance_docstring_block(steps) %}
561
+ {%- if _prov %}
562
+ {{ _prov }}
563
+ {%- endif %}
564
+ {%- for dc in steps %}
565
+ # {{ dc.rationale }}
566
+ # {{ action_description(dc) }}
567
+ df = {{ render_step_call(dc) }}
568
+ {%- endfor %}
569
+ return df
570
+ {% endfor %}
571
+
572
+
573
+ def run_silver_merge(create_holdout: bool = True, holdout_fraction: float = 0.1):
574
+ bronze_outputs = load_bronze_outputs()
575
+ silver = merge_sources(bronze_outputs)
576
+ silver = create_derived_columns(silver)
577
+
578
+ if create_holdout:
579
+ silver = create_holdout_mask(silver, holdout_fraction=holdout_fraction)
580
+
581
+ output_path = get_silver_path()
582
+ output_dir = output_path.parent
583
+ output_dir.mkdir(parents=True, exist_ok=True)
584
+ try:
585
+ from customer_retention.integrations.adapters.factory import get_delta
586
+ storage = get_delta(force_local=True)
587
+ storage.write(silver, str(output_dir / "silver"))
588
+ except ImportError:
589
+ silver.to_parquet(output_path, index=False)
590
+ return silver
591
+
592
+
593
+ if __name__ == "__main__":
594
+ run_silver_merge()
595
+ ''',
596
+ "gold.py.j2": '''import pandas as pd
597
+ import warnings
598
+ from datetime import datetime
599
+ from pathlib import Path
600
+ {% set all_gold_steps = config.gold.transformations + config.gold.encodings + config.gold.scalings %}
601
+ {% set ops, fitted = collect_imports(all_gold_steps, True) %}
602
+ {% set fs_ops = ['apply_feature_select'] if config.gold.feature_selections else [] %}
603
+ from customer_retention.transforms import ArtifactStore{{ (', ' + (ops | sort | join(', '))) if ops }}{{ (', ' + (fs_ops | join(', '))) if fs_ops and 'apply_feature_select' not in ops }}
604
+ {% if fitted %}
605
+ from customer_retention.transforms.fitted import {{ fitted | sort | join(', ') }}
606
+ {% endif %}
607
+ from config import (get_silver_path, get_gold_path, get_feast_data_path,
608
+ TARGET_COLUMN, RECOMMENDATIONS_HASH, FEAST_REPO_PATH,
609
+ FEAST_FEATURE_VIEW, FEAST_ENTITY_KEY, FEAST_TIMESTAMP_COL, EXPERIMENTS_DIR,
610
+ ARTIFACTS_PATH, FIT_MODE)
611
+
612
+ {% if config.fit_mode %}
613
+ _store = ArtifactStore(Path(ARTIFACTS_PATH))
614
+ {% else %}
615
+ _store = ArtifactStore.from_manifest(Path(ARTIFACTS_PATH) / "manifest.yaml")
616
+ {% endif %}
617
+
618
+ from customer_retention.generators.pipeline_generator.models import (
619
+ PipelineTransformationType,
620
+ TransformationStep,
621
+ )
622
+
623
+ ENCODINGS = [
624
+ {% for enc in config.gold.encodings %}
625
+ TransformationStep(type=PipelineTransformationType.ENCODE, column="{{ enc.column }}", parameters={{ enc.parameters }}, rationale="{{ enc.rationale }}"),
626
+ {% endfor %}
627
+ ]
628
+
629
+ SCALINGS = [
630
+ {% for scale in config.gold.scalings %}
631
+ TransformationStep(type=PipelineTransformationType.SCALE, column="{{ scale.column }}", parameters={{ scale.parameters }}, rationale="{{ scale.rationale }}"),
632
+ {% endfor %}
633
+ ]
634
+
635
+
636
+ def load_silver() -> pd.DataFrame:
637
+ path = get_silver_path()
638
+ parent = path.parent
639
+ delta_path = parent / "silver"
640
+ if delta_path.is_dir() and (delta_path / "_delta_log").is_dir():
641
+ from customer_retention.integrations.adapters.factory import get_delta
642
+ return get_delta(force_local=True).read(str(delta_path))
643
+ return pd.read_parquet(path)
644
+
645
+
646
+ def load_gold() -> pd.DataFrame:
647
+ path = get_gold_path()
648
+ delta_path = path.parent / "gold"
649
+ if delta_path.is_dir() and (delta_path / "_delta_log").is_dir():
650
+ from customer_retention.integrations.adapters.factory import get_delta
651
+ return get_delta(force_local=True).read(str(delta_path))
652
+ return pd.read_parquet(path)
653
+
654
+
655
+ {% set transform_groups = group_steps(config.gold.transformations) %}
656
+
657
+ def apply_gold_transformations(df: pd.DataFrame) -> pd.DataFrame:
658
+ {%- if transform_groups %}
659
+ {%- for func_name, steps in transform_groups %}
660
+ df = {{ func_name }}(df)
661
+ {%- endfor %}
662
+ {%- endif %}
663
+ return df
664
+
665
+ {% for func_name, steps in transform_groups %}
666
+
667
+ def {{ func_name }}(df: pd.DataFrame) -> pd.DataFrame:
668
+ {%- set _prov = provenance_docstring_block(steps) %}
669
+ {%- if _prov %}
670
+ {{ _prov }}
671
+ {%- endif %}
672
+ {%- for t in steps %}
673
+ # {{ t.rationale }}
674
+ # {{ action_description(t) }}
675
+ df = {{ render_step_call(t, config.fit_mode) }}
676
+ {%- endfor %}
677
+ return df
678
+ {% endfor %}
679
+
680
+
681
+ def apply_encodings(df: pd.DataFrame) -> pd.DataFrame:
682
+ {%- set _prov = provenance_docstring_block(config.gold.encodings) %}
683
+ {%- if _prov %}
684
+ {{ _prov }}
685
+ {%- endif %}
686
+ {%- if config.gold.encodings %}
687
+ {%- for enc in config.gold.encodings %}
688
+ # {{ enc.rationale }}
689
+ # {{ action_description(enc) }}
690
+ df = {{ render_step_call(enc, config.fit_mode) }}
691
+ {%- endfor %}
692
+ {%- endif %}
693
+ return df
694
+
695
+
696
+ def apply_scaling(df: pd.DataFrame) -> pd.DataFrame:
697
+ {%- set _prov = provenance_docstring_block(config.gold.scalings) %}
698
+ {%- if _prov %}
699
+ {{ _prov }}
700
+ {%- endif %}
701
+ {%- if config.gold.scalings %}
702
+ {%- for scale in config.gold.scalings %}
703
+ # {{ scale.rationale }}
704
+ # {{ action_description(scale) }}
705
+ df = {{ render_step_call(scale, config.fit_mode) }}
706
+ {%- endfor %}
707
+ {%- endif %}
708
+ return df
709
+
710
+
711
+ def apply_feature_selection(df: pd.DataFrame) -> pd.DataFrame:
712
+ {% if config.gold.feature_selections %}
713
+ {% for fs in config.gold.feature_selections %}
714
+ # Feature selection
715
+ # drop {{ fs }} (feature selection)
716
+ df = apply_feature_select(df, '{{ fs }}')
717
+ {% endfor %}
718
+ {% endif %}
719
+ return df
720
+
721
+
722
+ def get_feature_version_tag() -> str:
723
+ if RECOMMENDATIONS_HASH:
724
+ return f"v1.0.0_{RECOMMENDATIONS_HASH}"
725
+ return "v1.0.0"
726
+
727
+
728
+ def add_feast_timestamp(df: pd.DataFrame, reference_date=None) -> pd.DataFrame:
729
+ if FEAST_TIMESTAMP_COL not in df.columns:
730
+ if "aggregation_reference_date" in df.attrs:
731
+ timestamp = df.attrs["aggregation_reference_date"]
732
+ print(f" Using aggregation reference_date for Feast timestamp: {timestamp}")
733
+ elif reference_date is not None:
734
+ timestamp = reference_date
735
+ print(f" Using provided reference_date for Feast timestamp: {timestamp}")
736
+ else:
737
+ timestamp = datetime.now()
738
+ warnings.warn(
739
+ f"No reference_date available for Feast timestamp. Using datetime.now() ({timestamp}). "
740
+ "This may cause temporal leakage - features should use actual aggregation dates. "
741
+ "Set aggregation_reference_date in DataFrame.attrs during aggregation.",
742
+ UserWarning
743
+ )
744
+ df[FEAST_TIMESTAMP_COL] = timestamp
745
+ return df
746
+
747
+
748
+ def materialize_to_feast(df: pd.DataFrame) -> None:
749
+ feast_path = get_feast_data_path()
750
+ feast_path.parent.mkdir(parents=True, exist_ok=True)
751
+ df_feast = df.copy()
752
+ df_feast = add_feast_timestamp(df_feast)
753
+ original_cols = [c for c in df_feast.columns if c.startswith("original_")]
754
+ if original_cols:
755
+ print(f" Excluding holdout columns from Feast: {original_cols}")
756
+ df_feast = df_feast.drop(columns=original_cols, errors="ignore")
757
+ try:
758
+ from customer_retention.integrations.adapters.factory import get_delta
759
+ storage = get_delta(force_local=True)
760
+ storage.write(df_feast, str(feast_path.parent / feast_path.stem))
761
+ except ImportError:
762
+ df_feast.to_parquet(feast_path, index=False)
763
+ print(f"Features materialized to Feast: {feast_path}")
764
+ print(f" Entity key: {FEAST_ENTITY_KEY}")
765
+ print(f" Feature view: {FEAST_FEATURE_VIEW}")
766
+ print(f" Rows: {len(df_feast):,}")
767
+
768
+
769
+ def run_gold_features():
770
+ silver = load_silver()
771
+ gold = apply_gold_transformations(silver)
772
+ gold = apply_encodings(gold)
773
+ gold = apply_scaling(gold)
774
+ gold = apply_feature_selection(gold)
775
+ {% if config.fit_mode %}
776
+ _store.save_manifest()
777
+ print(f"Fit artifacts saved to: {ARTIFACTS_PATH}")
778
+ {% endif %}
779
+ output_path = get_gold_path()
780
+ output_path.parent.mkdir(parents=True, exist_ok=True)
781
+ gold.attrs["recommendations_hash"] = RECOMMENDATIONS_HASH
782
+ gold.attrs["feature_version"] = get_feature_version_tag()
783
+ try:
784
+ from customer_retention.integrations.adapters.factory import get_delta
785
+ storage = get_delta(force_local=True)
786
+ storage.write(gold, str(output_path.parent / "gold"))
787
+ except ImportError:
788
+ gold.to_parquet(output_path, index=False)
789
+ print(f"Gold features saved with version: {get_feature_version_tag()}")
790
+ materialize_to_feast(gold)
791
+ return gold
792
+
793
+
794
+ if __name__ == "__main__":
795
+ run_gold_features()
796
+ ''',
797
+ "training.py.j2": '''import pandas as pd
798
+ import mlflow
799
+ import mlflow.sklearn
800
+ import mlflow.xgboost
801
+ import xgboost as xgb
802
+ from pathlib import Path
803
+ from feast import FeatureStore
804
+ from sklearn.model_selection import train_test_split, cross_val_score
805
+ from sklearn.ensemble import RandomForestClassifier
806
+ from sklearn.linear_model import LogisticRegression
807
+ from sklearn.preprocessing import LabelEncoder
808
+ from sklearn.metrics import (roc_auc_score, average_precision_score, f1_score,
809
+ precision_score, recall_score, accuracy_score)
810
+ from config import (TARGET_COLUMN, PIPELINE_NAME, RECOMMENDATIONS_HASH, MLFLOW_TRACKING_URI, MLFLOW_ARTIFACT_ROOT,
811
+ FEAST_REPO_PATH, FEAST_FEATURE_VIEW, FEAST_ENTITY_KEY, FEAST_TIMESTAMP_COL,
812
+ get_feast_data_path)
813
+
814
+ # Set tracking URI immediately to prevent default mlruns directory creation
815
+ mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
816
+
817
+
818
+ def _load_feast_data():
819
+ feast_path = get_feast_data_path()
820
+ delta_path = feast_path.parent / feast_path.stem
821
+ if delta_path.is_dir() and (delta_path / "_delta_log").is_dir():
822
+ from customer_retention.integrations.adapters.factory import get_delta
823
+ return get_delta(force_local=True).read(str(delta_path))
824
+ return pd.read_parquet(feast_path)
825
+
826
+
827
+ def get_training_data_from_feast() -> pd.DataFrame:
828
+ """Retrieve training data from Feast for training/serving consistency.
829
+
830
+ Uses get_historical_features for point-in-time correct feature retrieval.
831
+ This ensures training uses the exact same feature retrieval path as inference.
832
+ """
833
+ feast_path = Path(FEAST_REPO_PATH)
834
+
835
+ # Check if Feast repo is initialized
836
+ if not (feast_path / "feature_store.yaml").exists():
837
+ print("Feast repo not initialized, falling back to data file")
838
+ return _load_feast_data()
839
+
840
+ try:
841
+ store = FeatureStore(repo_path=str(feast_path))
842
+
843
+ # Read the materialized features to get entity keys and timestamps
844
+ features_df = _load_feast_data()
845
+
846
+ # Create entity dataframe for historical feature retrieval
847
+ entity_df = features_df[[FEAST_ENTITY_KEY, FEAST_TIMESTAMP_COL]].copy()
848
+
849
+ # Get all feature names (excluding entity key, timestamp, target, and holdout ground truth)
850
+ exclude_cols = {FEAST_ENTITY_KEY, FEAST_TIMESTAMP_COL, TARGET_COLUMN}
851
+ feature_cols = [c for c in features_df.columns
852
+ if c not in exclude_cols and not c.startswith("original_")]
853
+
854
+ # Build feature references
855
+ feature_refs = [f"{FEAST_FEATURE_VIEW}:{col}" for col in feature_cols]
856
+
857
+ print(f"Retrieving {len(feature_refs)} features from Feast...")
858
+ print(f" Feature view: {FEAST_FEATURE_VIEW}")
859
+ print(f" Entity key: {FEAST_ENTITY_KEY}")
860
+
861
+ # Get historical features with point-in-time correctness
862
+ training_df = store.get_historical_features(
863
+ entity_df=entity_df,
864
+ features=feature_refs
865
+ ).to_df()
866
+
867
+ # Add target column back
868
+ training_df = training_df.merge(
869
+ features_df[[FEAST_ENTITY_KEY, TARGET_COLUMN]],
870
+ on=FEAST_ENTITY_KEY,
871
+ how="left"
872
+ )
873
+
874
+ print(f" Retrieved {len(training_df):,} rows, {len(training_df.columns)} columns")
875
+ return training_df
876
+
877
+ except Exception as e:
878
+ print(f"Feast retrieval failed ({e}), falling back to data file")
879
+ return _load_feast_data()
880
+
881
+
882
+ def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
883
+ """Prepare features for model training.
884
+
885
+ Explicitly excludes original_* columns, which contain holdout ground truth.
886
+ These columns are reserved for scoring validation and must never be used in training.
887
+ """
888
+ df = df.copy()
889
+
890
+ # Drop Feast metadata columns
891
+ drop_cols = [FEAST_ENTITY_KEY, FEAST_TIMESTAMP_COL]
892
+ df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")
893
+
894
+ # Exclude original_* columns (holdout ground truth - prevents data leakage)
895
+ original_cols = [c for c in df.columns if c.startswith("original_")]
896
+ df = df.drop(columns=original_cols, errors="ignore")
897
+
898
+ # Encode categorical columns
899
+ for col in df.select_dtypes(include=["object", "category"]).columns:
900
+ df[col] = LabelEncoder().fit_transform(df[col].astype(str))
901
+
902
+ return df.select_dtypes(include=["int64", "float64", "int32", "float32"]).fillna(0)
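+ # Illustrative only (column names are hypothetical): for input columns
+ # ["customer_id", "event_timestamp", "plan", "original_churn_flag", "spend"], this drops the
+ # entity key, timestamp, and original_* holdout column, label-encodes "plan", and returns the
+ # remaining numeric columns with NaNs filled with 0.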
903
+
904
+
905
+ def compute_metrics(y_true, y_proba, y_pred) -> dict:
906
+ return {
907
+ "roc_auc": roc_auc_score(y_true, y_proba),
908
+ "pr_auc": average_precision_score(y_true, y_proba),
909
+ "f1": f1_score(y_true, y_pred),
910
+ "precision": precision_score(y_true, y_pred),
911
+ "recall": recall_score(y_true, y_pred),
912
+ "accuracy": accuracy_score(y_true, y_pred),
913
+ }
914
+
915
+
916
+ def get_feature_importance(model, feature_names) -> pd.DataFrame:
917
+ if hasattr(model, "feature_importances_"):
918
+ importance = model.feature_importances_
919
+ elif hasattr(model, "coef_"):
920
+ importance = abs(model.coef_[0])
921
+ else:
922
+ return None
923
+ df = pd.DataFrame({"feature": feature_names, "importance": importance})
924
+ return df.sort_values("importance", ascending=False).reset_index(drop=True)
925
+
926
+
927
+ def log_feature_importance(model, feature_names):
928
+ fi = get_feature_importance(model, feature_names)
929
+ if fi is None:
930
+ return
931
+ fi.to_csv("feature_importance.csv", index=False)
932
+ mlflow.log_artifact("feature_importance.csv")
933
+
934
+
935
+ def train_xgboost(X_train, y_train, X_test, y_test, feature_names):
936
+ mlflow.xgboost.autolog(log_datasets=False, log_models=False)
937
+ dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
938
+ dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_names)
939
+ params = {"objective": "binary:logistic", "eval_metric": ["auc", "logloss"],
940
+ "max_depth": 6, "learning_rate": 0.1, "seed": 42}
941
+ model = xgb.train(params, dtrain, num_boost_round=100,
942
+ evals=[(dtrain, "train"), (dtest, "eval")], verbose_eval=False)
943
+ return model
944
+
945
+
946
+ def get_model_name_with_hash(base_name: str) -> str:
947
+ if RECOMMENDATIONS_HASH:
948
+ return f"{base_name}_{RECOMMENDATIONS_HASH}"
949
+ return base_name
950
+
951
+
952
+ def run_experiment():
953
+ mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
954
+ experiment = mlflow.get_experiment_by_name(PIPELINE_NAME)
955
+ if experiment is None:
956
+ mlflow.create_experiment(PIPELINE_NAME, artifact_location=MLFLOW_ARTIFACT_ROOT)
957
+ mlflow.set_experiment(PIPELINE_NAME)
958
+ print(f"MLflow tracking: {MLFLOW_TRACKING_URI}")
959
+ print(f"Artifacts: {MLFLOW_ARTIFACT_ROOT}")
960
+
961
+ # Load training data from Feast (ensures training/serving consistency)
962
+ print("\\nLoading training data from Feast...")
963
+ training_data = get_training_data_from_feast()
964
+
965
+ y = training_data[TARGET_COLUMN]
966
+ X = prepare_features(training_data.drop(columns=[TARGET_COLUMN]))
967
+ feature_names = list(X.columns)
968
+ train_mask = y.notna()
969
+ X, y = X.loc[train_mask], y.loc[train_mask]
970
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
971
+
972
+ sklearn_models = {
973
+ "logistic_regression": LogisticRegression(max_iter=5000, random_state=42),
974
+ "random_forest": RandomForestClassifier(n_estimators=100, random_state=42),
975
+ }
976
+
977
+ run_name = get_model_name_with_hash("pipeline_run")
978
+ with mlflow.start_run(run_name=run_name):
979
+ mlflow.log_params({"train_samples": len(X_train), "test_samples": len(X_test), "n_features": X.shape[1]})
980
+ mlflow.set_tag("feature_source", "feast")
981
+ mlflow.set_tag("feast_feature_view", FEAST_FEATURE_VIEW)
982
+ if RECOMMENDATIONS_HASH:
983
+ mlflow.set_tag("recommendations_hash", RECOMMENDATIONS_HASH)
984
+ best_model, best_auc = None, 0
985
+
986
+ for name, model in sklearn_models.items():
987
+ with mlflow.start_run(run_name=name, nested=True):
988
+ if RECOMMENDATIONS_HASH:
989
+ mlflow.set_tag("recommendations_hash", RECOMMENDATIONS_HASH)
990
+ mlflow.set_tag("feature_source", "feast")
991
+ model.fit(X_train, y_train)
992
+ y_proba = model.predict_proba(X_test)[:, 1]
993
+ y_pred = model.predict(X_test)
994
+ metrics = compute_metrics(y_test, y_proba, y_pred)
995
+ cv = cross_val_score(model, X_train, y_train, cv=5, scoring="roc_auc")
996
+ mlflow.log_metrics({**metrics, "cv_mean": cv.mean(), "cv_std": cv.std()})
997
+ log_feature_importance(model, feature_names)
998
+ model_artifact_name = get_model_name_with_hash(f"model_{name}")
999
+ mlflow.sklearn.log_model(model, name=model_artifact_name)
1000
+ print(f"{name}: ROC-AUC={metrics['roc_auc']:.4f}, PR-AUC={metrics['pr_auc']:.4f}, F1={metrics['f1']:.4f}")
1001
+ if metrics["roc_auc"] > best_auc:
1002
+ best_auc, best_model = metrics["roc_auc"], name
1003
+
1004
+ with mlflow.start_run(run_name="xgboost", nested=True):
1005
+ if RECOMMENDATIONS_HASH:
1006
+ mlflow.set_tag("recommendations_hash", RECOMMENDATIONS_HASH)
1007
+ mlflow.set_tag("feature_source", "feast")
1008
+ xgb_model = train_xgboost(X_train, y_train, X_test, y_test, feature_names)
1009
+ dtest = xgb.DMatrix(X_test, feature_names=feature_names)
1010
+ y_proba = xgb_model.predict(dtest)
1011
+ y_pred = (y_proba > 0.5).astype(int)
1012
+ metrics = compute_metrics(y_test, y_proba, y_pred)
1013
+ mlflow.log_metrics(metrics)
1014
+ xgb_model_name = get_model_name_with_hash("model_xgboost")
1015
+ mlflow.xgboost.log_model(xgb_model, name=xgb_model_name)
1016
+ importance = xgb_model.get_score(importance_type="gain")
1017
+ fi = pd.DataFrame({"feature": importance.keys(), "importance": importance.values()})
1018
+ fi = fi.sort_values("importance", ascending=False).reset_index(drop=True)
1019
+ fi.to_csv("feature_importance.csv", index=False)
1020
+ mlflow.log_artifact("feature_importance.csv")
1021
+ print(f"xgboost: ROC-AUC={metrics['roc_auc']:.4f}, PR-AUC={metrics['pr_auc']:.4f}, F1={metrics['f1']:.4f}")
1022
+ if metrics["roc_auc"] > best_auc:
1023
+ best_auc, best_model = metrics["roc_auc"], "xgboost"
1024
+
1025
+ mlflow.set_tag("best_model", best_model)
1026
+ mlflow.log_metric("best_roc_auc", best_auc)
1027
+ print(f"Best: {best_model} (ROC-AUC={best_auc:.4f})")
1028
+
1029
+
1030
+ if __name__ == "__main__":
1031
+ run_experiment()
1032
+ ''',
1033
+ "runner.py.j2": '''import argparse
1034
+ from concurrent.futures import ThreadPoolExecutor
1035
+ from config import PIPELINE_NAME, EXPERIMENTS_DIR, PRODUCTION_DIR
1036
+ {% for name in config.landing %}
1037
+ from landing.landing_{{ name }} import run_landing_{{ name }}
1038
+ {% endfor %}
1039
+ {% for name, _ in config.bronze.items() %}
1040
+ from bronze.bronze_{{ name }} import run_bronze_{{ name }}
1041
+ {% endfor %}
1042
+ {% for name, _ in config.bronze_event.items() %}
1043
+ from bronze.bronze_{{ name }} import run_bronze_{{ name }}
1044
+ {% endfor %}
1045
+ from silver.silver_merge import run_silver_merge
1046
+ from gold.gold_features import run_gold_features
1047
+ from training.ml_experiment import run_experiment
1048
+
1049
+
1050
+ def setup_experiments_dir():
1051
+ EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)
1052
+ (EXPERIMENTS_DIR / "mlruns").mkdir(parents=True, exist_ok=True)
1053
+ PRODUCTION_DIR.mkdir(parents=True, exist_ok=True)
1054
+ (PRODUCTION_DIR / "data" / "bronze").mkdir(parents=True, exist_ok=True)
1055
+ (PRODUCTION_DIR / "data" / "silver").mkdir(parents=True, exist_ok=True)
1056
+ (PRODUCTION_DIR / "data" / "gold").mkdir(parents=True, exist_ok=True)
1057
+
1058
+
1059
+ def run_pipeline(validate=False):
1060
+ print(f"Starting pipeline: {PIPELINE_NAME}")
1061
+ setup_experiments_dir()
1062
+ {% if config.landing %}
1063
+
1064
+ print("\\n[1/6] Landing (event sources)...")
1065
+ {% for name in config.landing %}
1066
+ run_landing_{{ name }}()
1067
+ {% endfor %}
1068
+ print("Landing complete")
1069
+ if validate:
1070
+ from validation.validate_pipeline import validate_landing
1071
+ validate_landing()
1072
+ {% endif %}
1073
+
1074
+ print("\\n[{{ '2/6' if config.landing else '1/4' }}] Bronze (parallel)...")
1075
+ with ThreadPoolExecutor(max_workers={{ (config.bronze | length) + (config.bronze_event | length) }}) as executor:
1076
+ bronze_futures = [
1077
+ {% for name in config.bronze %}
1078
+ executor.submit(run_bronze_{{ name }}),
1079
+ {% endfor %}
1080
+ {% for name in config.bronze_event %}
1081
+ executor.submit(run_bronze_{{ name }}),
1082
+ {% endfor %}
1083
+ ]
1084
+ for f in bronze_futures:
1085
+ f.result()
1086
+ print("Bronze complete")
1087
+ if validate:
1088
+ from validation.validate_pipeline import validate_bronze
1089
+ validate_bronze()
1090
+
1091
+ print("\\n[{{ '3/6' if config.landing else '2/4' }}] Silver...")
1092
+ run_silver_merge()
1093
+ print("Silver complete")
1094
+ if validate:
1095
+ from validation.validate_pipeline import validate_silver
1096
+ validate_silver()
1097
+
1098
+ print("\\n[{{ '4/6' if config.landing else '3/4' }}] Gold...")
1099
+ run_gold_features()
1100
+ print("Gold complete")
1101
+ if validate:
1102
+ from validation.validate_pipeline import validate_gold
1103
+ validate_gold()
1104
+
1105
+ print("\\n[{{ '5/6' if config.landing else '4/4' }}] Training...")
1106
+ run_experiment()
1107
+ print("Training complete")
1108
+ if validate:
1109
+ from validation.validate_pipeline import validate_training
1110
+ validate_training()
1111
+
1112
+
1113
+ if __name__ == "__main__":
1114
+ parser = argparse.ArgumentParser()
1115
+ parser.add_argument("--validate", action="store_true")
1116
+ args = parser.parse_args()
1117
+ run_pipeline(validate=args.validate)
1118
+ ''',
1119
+ "run_all.py.j2": '''"""{{ config.name }} - Pipeline Runner with MLflow UI
1120
+
1121
+ All artifacts (data, mlruns, feast) are stored in the experiments directory.
1122
+ Override the location with the CR_EXPERIMENTS_DIR environment variable.
1123
+ """
1124
+ import os
1125
+ import sys
1126
+ import webbrowser
1127
+ import subprocess
1128
+ import time
1129
+ from pathlib import Path
1130
+ from concurrent.futures import ThreadPoolExecutor
1131
+
1132
+ sys.path.insert(0, str(Path(__file__).parent))
1133
+
1134
+ from config import PIPELINE_NAME, SOURCES, MLFLOW_TRACKING_URI, EXPERIMENTS_DIR, PRODUCTION_DIR, FINDINGS_DIR
1135
+ {% for name in config.landing %}
1136
+ from landing.landing_{{ name }} import run_landing_{{ name }}
1137
+ {% endfor %}
1138
+ {% for name in config.bronze %}
1139
+ from bronze.bronze_{{ name }} import run_bronze_{{ name }}
1140
+ {% endfor %}
1141
+ {% for name in config.bronze_event %}
1142
+ from bronze.bronze_{{ name }} import run_bronze_{{ name }}
1143
+ {% endfor %}
1144
+ from silver.silver_merge import run_silver_merge
1145
+ from gold.gold_features import run_gold_features
1146
+ from training.ml_experiment import run_experiment
1147
+
1148
+
1149
+ def setup_experiments_dir():
1150
+ EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)
1151
+ (EXPERIMENTS_DIR / "mlruns").mkdir(parents=True, exist_ok=True)
1152
+ PRODUCTION_DIR.mkdir(parents=True, exist_ok=True)
1153
+ (PRODUCTION_DIR / "data" / "bronze").mkdir(parents=True, exist_ok=True)
1154
+ (PRODUCTION_DIR / "data" / "silver").mkdir(parents=True, exist_ok=True)
1155
+ (PRODUCTION_DIR / "data" / "gold").mkdir(parents=True, exist_ok=True)
1156
+ print(f"Experiments directory: {EXPERIMENTS_DIR}")
1157
+ print(f"Production directory: {PRODUCTION_DIR}")
1158
+ print(f"MLflow tracking: {MLFLOW_TRACKING_URI}")
1159
+ print(f"Findings directory: {FINDINGS_DIR}")
1160
+
1161
+
1162
+ def run_landing():
1163
+ {% for name in config.landing %}
1164
+ run_landing_{{ name }}()
1165
+ {% endfor %}
1166
+ pass
1167
+
1168
+
1169
+ def run_bronze_parallel():
1170
+ bronze_funcs = [
1171
+ {% for name in config.bronze %}
1172
+ run_bronze_{{ name }},
1173
+ {% endfor %}
1174
+ {% for name in config.bronze_event %}
1175
+ run_bronze_{{ name }},
1176
+ {% endfor %}
1177
+ ]
1178
+ with ThreadPoolExecutor(max_workers={{ (config.bronze | length) + (config.bronze_event | length) }}) as ex:
1179
+ list(ex.map(lambda f: f(), bronze_funcs))
1180
+
1181
+
1182
+ def is_port_in_use(port):
1183
+ import socket
1184
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
1185
+ return s.connect_ex(('localhost', port)) == 0
1186
+
1187
+
1188
+ def start_mlflow_ui():
1189
+ port = 5050
1190
+ if is_port_in_use(port):
1191
+ print(f"\\n⚠ Port {port} is already in use.")
1192
+ print(f" Either MLflow is already running, or an old process is holding the port; kill it with:")
1193
+ print(f" pkill -f 'mlflow ui'")
1194
+ print(f"\\n Opening browser to existing server...")
1195
+ webbrowser.open(f"http://localhost:{port}")
1196
+ return None
1197
+
1198
+ print(f"\\nStarting MLflow UI (tracking: {MLFLOW_TRACKING_URI})...")
1199
+ process = subprocess.Popen(
1200
+ ["mlflow", "ui", "--backend-store-uri", MLFLOW_TRACKING_URI, "--port", str(port)],
1201
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
1202
+ )
1203
+ time.sleep(2)
1204
+ webbrowser.open(f"http://localhost:{port}")
1205
+ print(f"MLflow UI running at http://localhost:{port}")
1206
+ print("Press Ctrl+C to stop")
1207
+ return process
1208
+
1209
+
1210
+ def run_pipeline():
1211
+ print(f"Running {PIPELINE_NAME}")
1212
+ print("=" * 50)
1213
+
1214
+ setup_experiments_dir()
1215
+ {% if config.landing %}
1216
+
1217
+ print("\\n[1/6] Landing (event sources)...")
1218
+ run_landing()
1219
+ print("Landing complete")
1220
+ {% endif %}
1221
+
1222
+ print("\\n[{{ '2/6' if config.landing else '1/4' }}] Bronze (parallel)...")
1223
+ run_bronze_parallel()
1224
+ print("Bronze complete")
1225
+
1226
+ print("\\n[{{ '3/6' if config.landing else '2/4' }}] Silver...")
1227
+ run_silver_merge()
1228
+ print("Silver complete")
1229
+
1230
+ print("\\n[{{ '4/6' if config.landing else '3/4' }}] Gold...")
1231
+ run_gold_features()
1232
+ print("Gold complete")
1233
+
1234
+ print("\\n[{{ '5/6' if config.landing else '4/4' }}] Training...")
1235
+ run_experiment()
1236
+ print("Training complete")
1237
+
1238
+ print("\\n" + "=" * 50)
1239
+ print("Pipeline finished!")
1240
+
1241
+ mlflow_process = start_mlflow_ui()
1242
+ if mlflow_process:
1243
+ try:
1244
+ mlflow_process.wait()
1245
+ except KeyboardInterrupt:
1246
+ mlflow_process.terminate()
1247
+ print("\\nMLflow UI stopped")
1248
+
1249
+
1250
+ if __name__ == "__main__":
1251
+ run_pipeline()
1252
+ ''',
1253
+ "workflow.json.j2": """{
1254
+ "name": "{{ config.name }}_pipeline",
1255
+ "tasks": [
1256
+ {% for name in config.landing %}
1257
+ {
1258
+ "task_key": "landing_{{ name }}",
1259
+ "notebook_task": {
1260
+ "notebook_path": "/Workspace/orchestration/{{ config.name }}/landing/landing_{{ name }}"
1261
+ }
1262
+ },
1263
+ {% endfor %}
1264
+ {% for source in config.sources %}
1265
+ {
1266
+ "task_key": "bronze_{{ source.name }}",
1267
+ {% if config.landing %}
1268
+ "depends_on": [
1269
+ {% for name in config.landing %}
1270
+ {"task_key": "landing_{{ name }}"}{{ "," if not loop.last else "" }}
1271
+ {% endfor %}
1272
+ ],
1273
+ {% endif %}
1274
+ "notebook_task": {
1275
+ "notebook_path": "/Workspace/orchestration/{{ config.name }}/bronze/bronze_{{ source.name }}"
1276
+ }
1277
+ },
1278
+ {% endfor %}
1279
+ {
1280
+ "task_key": "silver_merge",
1281
+ "depends_on": [
1282
+ {% for source in config.sources %}
1283
+ {"task_key": "bronze_{{ source.name }}"}{{ "," if not loop.last else "" }}
1284
+ {% endfor %}
1285
+ ],
1286
+ "notebook_task": {
1287
+ "notebook_path": "/Workspace/orchestration/{{ config.name }}/silver/silver_merge"
1288
+ }
1289
+ },
1290
+ {
1291
+ "task_key": "gold_features",
1292
+ "depends_on": [{"task_key": "silver_merge"}],
1293
+ "notebook_task": {
1294
+ "notebook_path": "/Workspace/orchestration/{{ config.name }}/gold/gold_features"
1295
+ }
1296
+ },
1297
+ {
1298
+ "task_key": "ml_experiment",
1299
+ "depends_on": [{"task_key": "gold_features"}],
1300
+ "notebook_task": {
1301
+ "notebook_path": "/Workspace/orchestration/{{ config.name }}/training/ml_experiment"
1302
+ }
1303
+ }
1304
+ ]
1305
+ }
1306
+ """,
1307
+ "feature_store.yaml.j2": """project: {{ config.name }}
1308
+ registry: data/registry.db
1309
+ provider: local
1310
+ online_store:
1311
+ type: sqlite
1312
+ path: data/online_store.db
1313
+ offline_store:
1314
+ type: file
1315
+ entity_key_serialization_version: 2
1316
+ """,
1317
+ "features.py.j2": '''"""Feast Feature Definitions for {{ config.name }}
1318
+
1319
+ Auto-generated feature view definitions for training/serving consistency.
1320
+ Feature version: {{ config.recommendations_hash or "unversioned" }}
1321
+ """
1322
+ from datetime import timedelta
1323
+ from feast import Entity, FeatureView, Field, FileSource
1324
+ from feast.types import Float32, Float64, Int64, String
1325
+
1326
+ # Entity definition
1327
+ {{ config.feast.entity_name if config.feast else "customer" }} = Entity(
1328
+ name="{{ config.feast.entity_name if config.feast else 'customer' }}",
1329
+ join_keys=["{{ config.feast.entity_key if config.feast else config.sources[0].entity_key }}"],
1330
+ description="Primary entity for {{ config.name }} pipeline"
1331
+ )
1332
+
1333
+ # File source pointing to materialized features
1334
+ {{ config.feast.feature_view_name if config.feast else config.name + '_features' }}_source = FileSource(
1335
+ path="data/{{ config.feast.feature_view_name if config.feast else config.name + '_features' }}.parquet",
1336
+ timestamp_field="{{ config.feast.timestamp_column if config.feast else 'event_timestamp' }}"
1337
+ )
1338
+
1339
+ # Feature view definition
1340
+ # Note: Features are dynamically determined from the parquet file schema
1341
+ # This is a placeholder that gets populated when feast apply is run
1342
+ {{ config.feast.feature_view_name if config.feast else config.name + '_features' }} = FeatureView(
1343
+ name="{{ config.feast.feature_view_name if config.feast else config.name + '_features' }}",
1344
+ entities=[{{ config.feast.entity_name if config.feast else "customer" }}],
1345
+ ttl=timedelta(days={{ config.feast.ttl_days if config.feast else 365 }}),
1346
+ source={{ config.feast.feature_view_name if config.feast else config.name + '_features' }}_source,
1347
+ tags={
1348
+ "pipeline": "{{ config.name }}",
1349
+ "recommendations_hash": "{{ config.recommendations_hash or 'none' }}",
1350
+ "version": "v1.0.0_{{ config.recommendations_hash or 'unversioned' }}"
1351
+ }
1352
+ )
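+ # Downstream code references individual features as "<feature_view_name>:<column_name>", the same
+ # format training.py.j2 uses when building feature_refs for get_historical_features.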
1353
+ ''',
1354
+ "landing.py.j2": '''import pandas as pd
1355
+ import numpy as np
1356
+ from pathlib import Path
1357
+ from config import RAW_SOURCES, PRODUCTION_DIR
1358
+
1359
+ SOURCE_NAME = "{{ name }}"
1360
+ ENTITY_COLUMN = "{{ config.entity_column }}"
1361
+ TIME_COLUMN = "{{ config.time_column }}"
1362
+ TARGET_COLUMN = "{{ config.target_column }}"
1363
+
1364
+
1365
+ def load_raw_data() -> pd.DataFrame:
1366
+ source = RAW_SOURCES[SOURCE_NAME]
1367
+ path = Path(source["path"])
1368
+ if path.is_dir() and (path / "_delta_log").is_dir():
1369
+ from customer_retention.integrations.adapters.factory import get_delta
1370
+ return get_delta(force_local=True).read(str(path))
1371
+ if not path.exists():
1372
+ raise FileNotFoundError(f"Raw source not found: {path}")
1373
+ if source["format"] == "csv":
1374
+ return pd.read_csv(path)
1375
+ return pd.read_parquet(path)
1376
+
1377
+ {% if config.timestamp_coalesce %}
1378
+
1379
+ def coalesce_timestamps(df: pd.DataFrame) -> pd.DataFrame:
1380
+ {% set cols = config.timestamp_coalesce.datetime_columns_ordered %}
1381
+ {% set out = config.timestamp_coalesce.output_column %}
1382
+ df["{{ out }}"] = pd.to_datetime(df["{{ cols[-1] }}"], errors="coerce")
1383
+ {% for col in cols[:-1] | reverse %}
1384
+ df["{{ out }}"] = df["{{ out }}"].fillna(pd.to_datetime(df["{{ col }}"], errors="coerce"))
1385
+ {% endfor %}
1386
+ return df
1387
+ {% endif %}
1388
+
1389
+ {% if config.label_timestamp %}
1390
+
1391
+ def derive_label_timestamp(df: pd.DataFrame) -> pd.DataFrame:
1392
+ {% set lt = config.label_timestamp %}
1393
+ {% set feature_ts = config.timestamp_coalesce.output_column if config.timestamp_coalesce else config.time_column %}
1394
+ {% if lt.label_column %}
1395
+ df["{{ lt.output_column }}"] = pd.to_datetime(df["{{ lt.label_column }}"], errors="coerce")
1396
+ df["{{ lt.output_column }}"] = df["{{ lt.output_column }}"].fillna(
1397
+ pd.to_datetime(df["{{ feature_ts }}"], errors="coerce") + pd.Timedelta(days={{ lt.fallback_window_days }})
1398
+ )
1399
+ {% else %}
1400
+ df["{{ lt.output_column }}"] = pd.to_datetime(df["{{ feature_ts }}"], errors="coerce") + pd.Timedelta(days={{ lt.fallback_window_days }})
1401
+ {% endif %}
1402
+ return df
1403
+ {% endif %}
1404
+
1405
+
1406
+ def get_landing_output_path() -> Path:
1407
+ return PRODUCTION_DIR / "data" / "landing" / f"{SOURCE_NAME}.parquet"
1408
+
1409
+
1410
+ def run_landing_{{ name }}():
1411
+ print(f"Landing: {SOURCE_NAME}")
1412
+ df = load_raw_data()
1413
+ print(f" Raw records: {len(df):,}")
1414
+ {% if config.raw_time_column %}
1415
+ df = df.rename(columns={"{{ config.raw_time_column }}": TIME_COLUMN})
1416
+ {% endif %}
1417
+ {% if config.original_target_column %}
1418
+ df = df.rename(columns={"{{ config.original_target_column }}": TARGET_COLUMN})
1419
+ {% endif %}
1420
+ {% if config.timestamp_coalesce %}
1421
+ df = coalesce_timestamps(df)
1422
+ {% endif %}
1423
+ {% if config.label_timestamp %}
1424
+ df = derive_label_timestamp(df)
1425
+ {% endif %}
1426
+ output_path = get_landing_output_path()
1427
+ output_dir = output_path.parent
1428
+ output_dir.mkdir(parents=True, exist_ok=True)
1429
+ try:
1430
+ from customer_retention.integrations.adapters.factory import get_delta
1431
+ storage = get_delta(force_local=True)
1432
+ storage.write(df, str(output_dir / SOURCE_NAME))
1433
+ except ImportError:
1434
+ df.to_parquet(output_path, index=False)
1435
+ print(f" Records: {len(df):,}")
1436
+ print(f" Output: {output_path}")
1437
+ return df
1438
+
1439
+
1440
+ if __name__ == "__main__":
1441
+ run_landing_{{ name }}()
1442
+ ''',
1443
+ "bronze_event.py.j2": '''import pandas as pd
1444
+ import numpy as np
1445
+ from pathlib import Path
1446
+ {% set ops, fitted = collect_imports(config.pre_shaping + config.post_shaping, False) %}
1447
+ {% if ops %}
1448
+ from customer_retention.transforms import {{ ops | sort | join(', ') }}
1449
+ {% endif %}
1450
+ from config import PRODUCTION_DIR, RAW_SOURCES, TARGET_COLUMN
1451
+
1452
+ SOURCE_NAME = "{{ source }}"
1453
+ ENTITY_COLUMN = "{{ config.entity_column }}"
1454
+ TIME_COLUMN = "{{ config.time_column }}"
1455
+
1456
+ {% set pre_groups = group_steps(config.pre_shaping) %}
1457
+
1458
+ def apply_pre_shaping(df: pd.DataFrame) -> pd.DataFrame:
1459
+ {% if config.deduplicate %}
1460
+ df = df.drop_duplicates(subset=[ENTITY_COLUMN, TIME_COLUMN], keep="first")
1461
+ {% endif %}
1462
+ {%- if pre_groups %}
1463
+ {%- for func_name, steps in pre_groups %}
1464
+ df = {{ func_name }}(df)
1465
+ {%- endfor %}
1466
+ {%- endif %}
1467
+ return df
1468
+
1469
+ {% for func_name, steps in pre_groups %}
1470
+
1471
+ def {{ func_name }}(df: pd.DataFrame) -> pd.DataFrame:
1472
+ {%- set _prov = provenance_docstring_block(steps) %}
1473
+ {%- if _prov %}
1474
+ {{ _prov }}
1475
+ {%- endif %}
1476
+ {%- for t in steps %}
1477
+ # {{ t.rationale }}
1478
+ # {{ action_description(t) }}
1479
+ df = {{ render_step_call(t) }}
1480
+ {%- endfor %}
1481
+ return df
1482
+ {% endfor %}
1483
+
1484
+ {% if config.aggregation %}
1485
+ def _parse_window(window_str):
1486
+ if window_str == "all_time":
1487
+ return None
1488
+ if window_str.endswith("d"):
1489
+ return pd.Timedelta(days=int(window_str[:-1]))
1490
+ if window_str.endswith("h"):
1491
+ return pd.Timedelta(hours=int(window_str[:-1]))
1492
+ if window_str.endswith("w"):
1493
+ return pd.Timedelta(weeks=int(window_str[:-1]))
1494
+ return pd.Timedelta(days=int(window_str))
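+ # For example: _parse_window("30d") -> Timedelta(days=30), "12h" -> Timedelta(hours=12),
+ # "2w" -> Timedelta(weeks=2), "all_time" -> None (no time filter); a bare number is treated as days.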
1495
+
1496
+
1497
+ AGGREGATION_WINDOWS = {{ config.aggregation.windows }}
1498
+ VALUE_COLUMNS = {{ config.aggregation.value_columns }}
1499
+ AGG_FUNCS = {{ config.aggregation.agg_funcs }}
1500
+ {% endif %}
1501
+
1502
+
1503
+ def apply_reshaping(df: pd.DataFrame) -> pd.DataFrame:
1504
+ {% if config.aggregation %}
1505
+ df[TIME_COLUMN] = pd.to_datetime(df[TIME_COLUMN])
1506
+ reference_date = df[TIME_COLUMN].max()
1507
+ result = df.groupby(ENTITY_COLUMN).agg("first")[[]]  # empty frame indexed by entity; windowed aggregates are added below
1508
+ if TARGET_COLUMN in df.columns:
1509
+ result[TARGET_COLUMN] = df.groupby(ENTITY_COLUMN)[TARGET_COLUMN].first()
1510
+ for window in AGGREGATION_WINDOWS:
1511
+ td = _parse_window(window)
1512
+ window_df = df if td is None else df[df[TIME_COLUMN] >= (reference_date - td)]
1513
+ for col in VALUE_COLUMNS:
1514
+ for func in AGG_FUNCS:
1515
+ result[f"{col}_{func}_{window}"] = window_df.groupby(ENTITY_COLUMN)[col].agg(func)
1516
+ result[f"event_count_{window}"] = window_df.groupby(ENTITY_COLUMN).size()
1517
+ df = result.reset_index()
1518
+ {% endif %}
1519
+ return df
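+ # The reshaped frame has one row per entity, with columns named "<value_col>_<agg>_<window>"
+ # (e.g. "amount_sum_30d" for a hypothetical value column "amount") plus "event_count_<window>".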
1520
+
1521
+ {% if config.lifecycle %}
1522
+
1523
+ def _load_raw_events():
1524
+ source = RAW_SOURCES[SOURCE_NAME]
1525
+ path = Path(source["path"])
1526
+ if path.is_dir() and (path / "_delta_log").is_dir():
1527
+ from customer_retention.integrations.adapters.factory import get_delta
1528
+ return get_delta(force_local=True).read(str(path))
1529
+ if not path.exists():
1530
+ raise FileNotFoundError(f"Raw source not found: {path}")
1531
+ if source["format"] == "csv":
1532
+ return pd.read_csv(path)
1533
+ return pd.read_parquet(path)
1534
+
1535
+ {% if config.lifecycle.include_recency_bucket %}
1536
+
1537
+ def add_recency_tenure(df: pd.DataFrame, raw_df: pd.DataFrame) -> pd.DataFrame:
1538
+ raw_df[TIME_COLUMN] = pd.to_datetime(raw_df[TIME_COLUMN])
1539
+ reference_date = raw_df[TIME_COLUMN].max()
1540
+ entity_stats = raw_df.groupby(ENTITY_COLUMN)[TIME_COLUMN].agg(["min", "max"])
1541
+ entity_stats["days_since_last"] = (reference_date - entity_stats["max"]).dt.days
1542
+ entity_stats["days_since_first"] = (reference_date - entity_stats["min"]).dt.days
1543
+ df = df.merge(entity_stats[["days_since_last", "days_since_first"]], left_on=ENTITY_COLUMN, right_index=True, how="left")
1544
+ return df
1545
+
1546
+
1547
+ def add_recency_buckets(df: pd.DataFrame) -> pd.DataFrame:
1548
+ if "days_since_last" in df.columns:
1549
+ df["recency_bucket"] = pd.cut(df["days_since_last"], bins=[0, 7, 30, 90, 180, 365, float("inf")],
1550
+ labels=["0-7d", "7-30d", "30-90d", "90-180d", "180-365d", "365d+"], include_lowest=True)  # include_lowest so a 0-day recency lands in the first bucket
1551
+ return df
1552
+
1553
+ {% endif %}
1554
+ {% if config.lifecycle.include_lifecycle_quadrant %}
1555
+
1556
+ def add_lifecycle_quadrant(df: pd.DataFrame) -> pd.DataFrame:
1557
+ if "days_since_first" not in df.columns:
1558
+ return df
1559
+ tenure = df["days_since_first"]
1560
+ intensity_col = [c for c in df.columns if c.startswith("event_count_")]
1561
+ if not intensity_col:
1562
+ return df
1563
+ intensity = df[intensity_col[0]]
1564
+ tenure_med = tenure.median()
1565
+ intensity_med = intensity.median()
1566
+ conditions = [
1567
+ (tenure >= tenure_med) & (intensity >= intensity_med),
1568
+ (tenure >= tenure_med) & (intensity < intensity_med),
1569
+ (tenure < tenure_med) & (intensity >= intensity_med),
1570
+ (tenure < tenure_med) & (intensity < intensity_med),
1571
+ ]
1572
+ labels = ["loyal", "at_risk", "new_active", "new_inactive"]
1573
+ df["lifecycle_quadrant"] = np.select(conditions, labels, default="unknown")
1574
+ return df
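+ # The quadrant is a median split on tenure vs. event intensity: loyal = long tenure & high activity,
+ # at_risk = long tenure & low activity, new_active = short tenure & high activity,
+ # new_inactive = short tenure & low activity; rows with NaN in either metric default to "unknown".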
1575
+
1576
+ {% endif %}
1577
+ {% if config.lifecycle.include_cyclical_features %}
1578
+
1579
+ def add_cyclical_features(df: pd.DataFrame, raw_df: pd.DataFrame) -> pd.DataFrame:
1580
+ raw_df[TIME_COLUMN] = pd.to_datetime(raw_df[TIME_COLUMN])
1581
+ mean_dow = raw_df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.dayofweek.mean())
1582
+ df = df.merge(mean_dow.rename("mean_dow"), left_on=ENTITY_COLUMN, right_index=True, how="left")
1583
+ df["dow_sin"] = np.sin(2 * np.pi * df["mean_dow"] / 7)
1584
+ df["dow_cos"] = np.cos(2 * np.pi * df["mean_dow"] / 7)
1585
+ df = df.drop(columns=["mean_dow"], errors="ignore")
1586
+ return df
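+ # The sin/cos pair places the mean day-of-week on the unit circle, so Sunday (6) and Monday (0)
+ # stay adjacent instead of maximally distant as a raw 0-6 integer encoding would imply.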
1587
+
1588
+ {% endif %}
1589
+ {% if config.lifecycle.momentum_pairs %}
1590
+
1591
+ def add_momentum_ratios(df: pd.DataFrame) -> pd.DataFrame:
1592
+ {% for pair in config.lifecycle.momentum_pairs %}
1593
+ short_col = "event_count_{{ pair.short_window }}"
1594
+ long_col = "event_count_{{ pair.long_window }}"
1595
+ if short_col in df.columns and long_col in df.columns:
1596
+ df["momentum_{{ pair.short_window }}_{{ pair.long_window }}"] = df[short_col] / df[long_col].replace(0, float("nan"))
1597
+ {% endfor %}
1598
+ return df
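+ # The ratio compares short-window to long-window event counts: values near 1 mean most long-window
+ # activity is recent, values near 0 mean activity has tapered off; a zero long-window count yields
+ # NaN (not inf) because 0 is replaced with NaN before dividing.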
1599
+
1600
+ {% endif %}
1601
+
1602
+ def enrich_lifecycle(df: pd.DataFrame) -> pd.DataFrame:
1603
+ raw_df = _load_raw_events()
1604
+ {% if config.raw_time_column %}
1605
+ raw_df = raw_df.rename(columns={"{{ config.raw_time_column }}": TIME_COLUMN})
1606
+ {% endif %}
1607
+ {% if config.lifecycle.include_recency_bucket %}
1608
+ df = add_recency_tenure(df, raw_df)
1609
+ df = add_recency_buckets(df)
1610
+ {% endif %}
1611
+ {% if config.lifecycle.include_lifecycle_quadrant %}
1612
+ df = add_lifecycle_quadrant(df)
1613
+ {% endif %}
1614
+ {% if config.lifecycle.include_cyclical_features %}
1615
+ df = add_cyclical_features(df, raw_df)
1616
+ {% endif %}
1617
+ {% if config.lifecycle.momentum_pairs %}
1618
+ df = add_momentum_ratios(df)
1619
+ {% endif %}
1620
+ return df
1621
+ {% endif %}
1622
+
1623
+ {% set post_groups = group_steps(config.post_shaping) %}
1624
+
1625
+ def apply_post_shaping(df: pd.DataFrame) -> pd.DataFrame:
1626
+ {% if config.lifecycle %}
1627
+ df = enrich_lifecycle(df)
1628
+ {% endif %}
1629
+ {%- if post_groups %}
1630
+ {%- for func_name, steps in post_groups %}
1631
+ df = {{ func_name }}(df)
1632
+ {%- endfor %}
1633
+ {%- endif %}
1634
+ return df
1635
+
1636
+ {% for func_name, steps in post_groups %}
1637
+
1638
+ def {{ func_name }}(df: pd.DataFrame) -> pd.DataFrame:
1639
+ {%- set _prov = provenance_docstring_block(steps) %}
1640
+ {%- if _prov %}
1641
+ {{ _prov }}
1642
+ {%- endif %}
1643
+ {%- for t in steps %}
1644
+ # {{ t.rationale }}
1645
+ # {{ action_description(t) }}
1646
+ df = {{ render_step_call(t) }}
1647
+ {%- endfor %}
1648
+ return df
1649
+ {% endfor %}
1650
+
1651
+
1652
+ def run_bronze_{{ source }}():
1653
+ landing_dir = PRODUCTION_DIR / "data" / "landing" / SOURCE_NAME
1654
+ landing_parquet = PRODUCTION_DIR / "data" / "landing" / f"{SOURCE_NAME}.parquet"
1655
+ if landing_dir.is_dir() and (landing_dir / "_delta_log").is_dir():
1656
+ from customer_retention.integrations.adapters.factory import get_delta
1657
+ df = get_delta(force_local=True).read(str(landing_dir))
1658
+ elif landing_parquet.exists():
1659
+ df = pd.read_parquet(landing_parquet)
1660
+ else:
1661
+ raise FileNotFoundError(f"Landing output not found: {landing_parquet}")
1662
+ df = apply_pre_shaping(df)
1663
+ df = apply_reshaping(df)
1664
+ df = apply_post_shaping(df)
1665
+ bronze_dir = PRODUCTION_DIR / "data" / "bronze"
1666
+ bronze_dir.mkdir(parents=True, exist_ok=True)
1667
+ try:
1668
+ from customer_retention.integrations.adapters.factory import get_delta
1669
+ storage = get_delta(force_local=True)
1670
+ storage.write(df, str(bronze_dir / SOURCE_NAME))
1671
+ except ImportError:
1672
+ output_path = bronze_dir / f"{SOURCE_NAME}.parquet"
1673
+ df.to_parquet(output_path, index=False)
1674
+ return df
1675
+
1676
+
1677
+ if __name__ == "__main__":
1678
+ run_bronze_{{ source }}()
1679
+ ''',
1680
+ "validate.py.j2": '''import sys
1681
+ from pathlib import Path
1682
+
1683
+ sys.path.insert(0, str(Path(__file__).parent.parent))
1684
+
1685
+ import pandas as pd
1686
+ import numpy as np
1687
+ from config import SOURCES, EXPLORATION_ARTIFACTS, EXPERIMENTS_DIR, PRODUCTION_DIR, TARGET_COLUMN
1688
+
1689
+
1690
+ def _load_artifact(path):
1691
+ path = Path(path)
1692
+ if path.is_dir() and (path / "_delta_log").is_dir():
1693
+ from customer_retention.integrations.adapters.factory import get_delta
1694
+ return get_delta(force_local=True).read(str(path))
1695
+ return pd.read_parquet(path)
1696
+
1697
+
1698
+ def _compare_dataframes(stage, production_path, exploration_path, entity_key=None, tolerance=1e-5):
1699
+ if not Path(production_path).exists() and not (Path(production_path).is_dir() and (Path(production_path) / "_delta_log").is_dir()):
1700
+ raise FileNotFoundError(f"[{stage}] Production output not found: {production_path}")
1701
+ if not Path(exploration_path).exists() and not (Path(exploration_path).is_dir() and (Path(exploration_path) / "_delta_log").is_dir()):
1702
+ print(f"[{stage}] SKIP - exploration artifact not found: {exploration_path}")
1703
+ return True
1704
+
1705
+ prod = _load_artifact(production_path)
1706
+ expl = _load_artifact(exploration_path)
1707
+
1708
+ if entity_key and entity_key in prod.columns and entity_key in expl.columns:
1709
+ prod = prod.sort_values(entity_key).reset_index(drop=True)
1710
+ expl = expl.sort_values(entity_key).reset_index(drop=True)
1711
+
1712
+ if prod.shape[0] != expl.shape[0]:
1713
+ raise AssertionError(f"[{stage}] Row count: production={prod.shape[0]} vs exploration={expl.shape[0]}")
1714
+
1715
+ prod_cols = set(prod.columns)
1716
+ expl_cols = set(expl.columns)
1717
+ missing = expl_cols - prod_cols
1718
+ extra = prod_cols - expl_cols
1719
+ if missing:
1720
+ print(f"[{stage}] WARNING: missing columns: {missing}")
1721
+ if extra:
1722
+ print(f"[{stage}] INFO: extra columns: {extra}")
1723
+
1724
+ common = sorted(prod_cols & expl_cols)
1725
+ for col in common:
1726
+ if pd.api.types.is_numeric_dtype(prod[col]) and pd.api.types.is_numeric_dtype(expl[col]):
1727
+ try:
1728
+ pd.testing.assert_series_equal(prod[col], expl[col], check_exact=False, rtol=tolerance, check_names=False)
1729
+ except AssertionError as e:
1730
+ delta = (prod[col].astype(float) - expl[col].astype(float)).abs()
1731
+ max_idx = delta.idxmax()
1732
+ raise AssertionError(
1733
+ f"[{stage}] Column '{col}' diverges at row {max_idx}: "
1734
+ f"production={prod[col].iloc[max_idx]} vs exploration={expl[col].iloc[max_idx]} "
1735
+ f"(max delta={delta.max():.2e})"
1736
+ ) from None
1737
+
1738
+ print(f"[{stage}] PASS - {prod.shape[0]} rows, {len(common)} common cols, tolerance={tolerance}")
1739
+ return True
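+ # On a numeric mismatch, the raised message looks like (values illustrative):
+ #   [Gold] Column 'spend_sum_30d' diverges at row 42: production=10.0 vs exploration=9.5 (max delta=5.00e-01)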
1740
+
1741
+
1742
+ def validate_landing(tolerance=1e-5):
1743
+ landing_dir = PRODUCTION_DIR / "data" / "landing"
1744
+ if not landing_dir.exists():
1745
+ print("[Landing] SKIP - no landing directory")
1746
+ return True
1747
+ for path in landing_dir.glob("*.parquet"):
1748
+ name = path.stem
1749
+ expl_key = f"landing_{name}" if f"landing_{name}" in EXPLORATION_ARTIFACTS else "landing"
1750
+ if expl_key in EXPLORATION_ARTIFACTS:
1751
+ _compare_dataframes(f"Landing/{name}", str(path), EXPLORATION_ARTIFACTS[expl_key])
1752
+ return True
1753
+
1754
+
1755
+ def validate_bronze(tolerance=1e-5):
1756
+ bronze_artifacts = EXPLORATION_ARTIFACTS.get("bronze", {})
1757
+ for name, expl_path in bronze_artifacts.items():
1758
+ prod_path = PRODUCTION_DIR / "data" / "bronze" / f"{name}.parquet"
1759
+ _compare_dataframes(f"Bronze/{name}", str(prod_path), expl_path, tolerance=tolerance)
1760
+ return True
1761
+
1762
+
1763
+ def validate_silver(tolerance=1e-5):
1764
+ prod_path = PRODUCTION_DIR / "data" / "silver" / "merged.parquet"
1765
+ expl_path = EXPLORATION_ARTIFACTS.get("silver", "")
1766
+ entity_key = list(SOURCES.values())[0]["entity_key"] if SOURCES else None
1767
+ _compare_dataframes("Silver", str(prod_path), expl_path, entity_key=entity_key, tolerance=tolerance)
1768
+ return True
1769
+
1770
+
1771
+ def validate_gold(tolerance=1e-5):
1772
+ prod_path = PRODUCTION_DIR / "data" / "gold" / "features.parquet"
1773
+ expl_path = EXPLORATION_ARTIFACTS.get("gold", "")
1774
+ entity_key = list(SOURCES.values())[0]["entity_key"] if SOURCES else None
1775
+ _compare_dataframes("Gold", str(prod_path), expl_path, entity_key=entity_key, tolerance=tolerance)
1776
+ return True
1777
+
1778
+
1779
+ def validate_training():
1780
+ print("[Training] SKIP - training validation requires MLflow comparison (not yet implemented)")
1781
+ return True
1782
+
1783
+
1784
+ def validate_scoring(tolerance=1e-5):
1785
+ prod_path = PRODUCTION_DIR / "data" / "scoring" / "predictions.parquet"
1786
+ expl_path = EXPLORATION_ARTIFACTS.get("scoring", "")
1787
+ _compare_dataframes("Scoring", str(prod_path), expl_path, tolerance=tolerance)
1788
+ return True
1789
+
1790
+
1791
+ def run_all_validations(tolerance=1e-5):
1792
+ stages = [
1793
+ ("Landing", lambda: validate_landing(tolerance)),
1794
+ ("Bronze", lambda: validate_bronze(tolerance)),
1795
+ ("Silver", lambda: validate_silver(tolerance)),
1796
+ ("Gold", lambda: validate_gold(tolerance)),
1797
+ ("Training", validate_training),
1798
+ ("Scoring", lambda: validate_scoring(tolerance)),
1799
+ ]
1800
+ results = []
1801
+ for name, fn in stages:
1802
+ try:
1803
+ fn()
1804
+ results.append((name, "PASS"))
1805
+ except Exception as e:
1806
+ results.append((name, f"FAIL: {e}"))
1807
+ break
1808
+
1809
+ print("\\nStage Validation Report")
1810
+ print("=" * 50)
1811
+ for name, status in results:
1812
+ print(f"[{status.split(':')[0]:4s}] {name}")
1813
+ return results
1814
+ ''',
1815
+ "run_validation.py.j2": '''"""{{ config.name }} - Standalone Validation Runner
1816
+
1817
+ Compares pipeline outputs against exploration artifacts.
1818
+ Run after the pipeline completes to verify correctness.
1819
+ """
1820
+ import sys
1821
+ from pathlib import Path
1822
+
1823
+ sys.path.insert(0, str(Path(__file__).parent.parent))
1824
+
1825
+ from validation.validate_pipeline import run_all_validations
1826
+
1827
+
1828
+ if __name__ == "__main__":
1829
+ import argparse
1830
+ parser = argparse.ArgumentParser(description="Validate pipeline outputs")
1831
+ parser.add_argument("--tolerance", type=float, default=1e-5)
1832
+ args = parser.parse_args()
1833
+
1834
+ results = run_all_validations(tolerance=args.tolerance)
1835
+ failures = [r for r in results if not r[1].startswith("PASS")]
1836
+ sys.exit(1 if failures else 0)
1837
+ ''',
1838
+ "exploration_report.py.j2": '''"""Exploration Report Viewer
1839
+
1840
+ Opens HTML documentation for the exploration notebooks that informed
1841
+ the pipeline transformations. Works both locally (file:// URI) and
1842
+ on Databricks (displayHTML with scroll-to-anchor injection).
1843
+ """
1844
+ import os
1845
+ import webbrowser
1846
+ from pathlib import Path
1847
+
1848
+ # Known notebooks referenced by pipeline provenance comments
1849
+ KNOWN_NOTEBOOKS = [
1850
+ {% for nb in notebooks %}
1851
+ "{{ nb }}",
1852
+ {% endfor %}
1853
+ ]
1854
+
1855
+ DOCS_DIR = Path(os.environ.get("CR_DOCS_BASE_URL", str(Path(__file__).parent)))
1856
+
1857
+
1858
+ def _is_databricks():
1859
+ return "DATABRICKS_RUNTIME_VERSION" in os.environ
1860
+
1861
+
1862
+ def list_reports():
1863
+ for nb in KNOWN_NOTEBOOKS:
1864
+ html_path = DOCS_DIR / f"{nb}.html"
1865
+ status = "available" if html_path.exists() else "missing"
1866
+ print(f" {nb}: {status}")
1867
+
1868
+
1869
+ if __name__ == "__main__":
1870
+ print("Available exploration reports:")
1871
+ list_reports()
1872
+ ''',
1873
+ }
1874
+
1875
+
1876
+ class CodeRenderer:
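+ """Renders the generated pipeline source files from the inline Jinja2 templates in TEMPLATES."""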
1877
+ _TEMPLATE_MAP = {
1878
+ "config": "config.py.j2",
1879
+ "silver": "silver.py.j2",
1880
+ "gold": "gold.py.j2",
1881
+ "training": "training.py.j2",
1882
+ "runner": "runner.py.j2",
1883
+ "workflow": "workflow.json.j2",
1884
+ "run_all": "run_all.py.j2",
1885
+ "feast_config": "feature_store.yaml.j2",
1886
+ "feast_features": "features.py.j2",
1887
+
1888
+ "landing": "landing.py.j2",
1889
+ "bronze_event": "bronze_event.py.j2",
1890
+ "validation": "validate.py.j2",
1891
+ "run_validation": "run_validation.py.j2",
1892
+ "exploration_report": "exploration_report.py.j2",
1893
+ }
1894
+
1895
+ def __init__(self):
1896
+ self._env = Environment(loader=InlineLoader(TEMPLATES))
1897
+ self._env.globals["action_description"] = action_description
1898
+ self._env.globals["render_step_call"] = render_step_call
1899
+ self._env.globals["collect_imports"] = collect_imports
1900
+ self._env.globals["group_steps"] = group_steps
1901
+ self._env.globals["provenance_docstring_block"] = provenance_docstring_block
1902
+ self._env.globals["provenance_key"] = provenance_key
1903
+
1904
+ def set_docs_base(self, experiments_dir: str | None) -> None:
1905
+ global _docs_base
1906
+ if experiments_dir:
1907
+ _docs_base = f"file://{Path(experiments_dir).resolve() / 'docs'}"
1908
+ else:
1909
+ _docs_base = "docs"
1910
+
1911
+ def _render(self, template_key: str, **context) -> str:
1912
+ return self._env.get_template(self._TEMPLATE_MAP[template_key]).render(**context)
1913
+
1914
+ def render_config(self, config: PipelineConfig) -> str:
1915
+ return self._render("config", config=config)
1916
+
1917
+ def render_bronze(self, source_name: str, bronze_config: BronzeLayerConfig) -> str:
1918
+ return self._env.get_template("bronze.py.j2").render(source=source_name, config=bronze_config)
1919
+
1920
+ def render_silver(self, config: PipelineConfig) -> str:
1921
+ return self._render("silver", config=config)
1922
+
1923
+ def render_gold(self, config: PipelineConfig) -> str:
1924
+ return self._render("gold", config=config)
1925
+
1926
+ def render_training(self, config: PipelineConfig) -> str:
1927
+ return self._render("training", config=config)
1928
+
1929
+ def render_runner(self, config: PipelineConfig) -> str:
1930
+ return self._render("runner", config=config)
1931
+
1932
+ def render_workflow(self, config: PipelineConfig) -> str:
1933
+ return self._render("workflow", config=config)
1934
+
1935
+ def render_run_all(self, config: PipelineConfig) -> str:
1936
+ return self._render("run_all", config=config)
1937
+
1938
+ def render_feast_config(self, config: PipelineConfig) -> str:
1939
+ return self._render("feast_config", config=config)
1940
+
1941
+ def render_feast_features(self, config: PipelineConfig) -> str:
1942
+ return self._render("feast_features", config=config)
1943
+
1944
+
1945
+ def render_landing(self, name: str, config: LandingLayerConfig) -> str:
1946
+ return self._env.get_template("landing.py.j2").render(name=name, config=config)
1947
+
1948
+ def render_bronze_event(self, source_name: str, config: BronzeEventConfig) -> str:
1949
+ return self._env.get_template("bronze_event.py.j2").render(source=source_name, config=config)
1950
+
1951
+ def render_validation(self, config: PipelineConfig) -> str:
1952
+ return self._render("validation", config=config)
1953
+
1954
+ def render_run_validation(self, config: PipelineConfig) -> str:
1955
+ return self._render("run_validation", config=config)
1956
+
1957
+ def render_exploration_report(self, config: PipelineConfig) -> str:
1958
+ notebooks = set()
1959
+ for bronze in config.bronze.values():
1960
+ for step in bronze.transformations:
1961
+ nb = step.source_notebook or DEFAULT_NOTEBOOK_MAP.get(step.type)
1962
+ if nb:
1963
+ notebooks.add(nb)
1964
+ for step in config.gold.transformations + config.gold.encodings + config.gold.scalings:
1965
+ nb = step.source_notebook or DEFAULT_NOTEBOOK_MAP.get(step.type)
1966
+ if nb:
1967
+ notebooks.add(nb)
1968
+ for step in config.silver.derived_columns:
1969
+ nb = step.source_notebook or DEFAULT_NOTEBOOK_MAP.get(step.type)
1970
+ if nb:
1971
+ notebooks.add(nb)
1972
+ for be in config.bronze_event.values():
1973
+ for step in be.pre_shaping + be.post_shaping:
1974
+ nb = step.source_notebook or DEFAULT_NOTEBOOK_MAP.get(step.type)
1975
+ if nb:
1976
+ notebooks.add(nb)
1977
+ return self._render("exploration_report", notebooks=sorted(notebooks))
1978
+
1979
+
1980
+ _StepMeta = namedtuple("_StepMeta", ["desc_tpl", "call_tpl", "import_name", "param_defaults"])
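+ # desc_tpl: human-readable description template; call_tpl: template for the rendered transform call;
+ # import_name: helper imported from customer_retention.transforms; param_defaults: fallback parameter values.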
1981
+
1982
+ _STATELESS_REGISTRY = {
1983
+ PipelineTransformationType.IMPUTE_NULL: _StepMeta(
1984
+ "impute nulls in {col} with {value}",
1985
+ "apply_impute_null(df, '{col}', value='{value}')",
1986
+ "apply_impute_null", {"value": 0}),
1987
+ PipelineTransformationType.CAP_OUTLIER: _StepMeta(
1988
+ "cap outliers in {col} to [{lower}, {upper}]",
1989
+ "apply_cap_outlier(df, '{col}', lower={lower}, upper={upper})",
1990
+ "apply_cap_outlier", {"lower": 0, "upper": 1000000}),
1991
+ PipelineTransformationType.TYPE_CAST: _StepMeta(
1992
+ "cast {col} to {dtype}",
1993
+ "apply_type_cast(df, '{col}', dtype='{dtype}')",
1994
+ "apply_type_cast", {"dtype": "float"}),
1995
+ PipelineTransformationType.DROP_COLUMN: _StepMeta(
1996
+ "drop column {col}",
1997
+ "apply_drop_column(df, '{col}')",
1998
+ "apply_drop_column", {}),
1999
+ PipelineTransformationType.WINSORIZE: _StepMeta(
2000
+ "winsorize {col} to [{lower_bound}, {upper_bound}]",
2001
+ "apply_winsorize(df, '{col}', lower_bound={lower_bound}, upper_bound={upper_bound})",
2002
+ "apply_winsorize", {"lower_bound": 0, "upper_bound": 1000000}),
2003
+ PipelineTransformationType.SEGMENT_AWARE_CAP: _StepMeta(
2004
+ "segment-aware outlier cap on {col} ({n_segments} segments)",
2005
+ "apply_segment_aware_cap(df, '{col}', n_segments={n_segments})",
2006
+ "apply_segment_aware_cap", {"n_segments": 2}),
2007
+ PipelineTransformationType.LOG_TRANSFORM: _StepMeta(
2008
+ "log-transform {col}",
2009
+ "apply_log_transform(df, '{col}')",
2010
+ "apply_log_transform", {}),
2011
+ PipelineTransformationType.SQRT_TRANSFORM: _StepMeta(
2012
+ "sqrt-transform {col}",
2013
+ "apply_sqrt_transform(df, '{col}')",
2014
+ "apply_sqrt_transform", {}),
2015
+ PipelineTransformationType.ZERO_INFLATION_HANDLING: _StepMeta(
2016
+ "handle zero-inflation in {col}",
2017
+ "apply_zero_inflation_handling(df, '{col}')",
2018
+ "apply_zero_inflation_handling", {}),
2019
+ PipelineTransformationType.CAP_THEN_LOG: _StepMeta(
2020
+ "cap at p99 then log-transform {col}",
2021
+ "apply_cap_then_log(df, '{col}')",
2022
+ "apply_cap_then_log", {}),
2023
+ PipelineTransformationType.FEATURE_SELECT: _StepMeta(
2024
+ "drop {col} (feature selection)",
2025
+ "apply_feature_select(df, '{col}')",
2026
+ "apply_feature_select", {}),
2027
+ }
2028
+
2029
+
2030
+ def _extract_params(step, meta):
2031
+ return {k: step.parameters.get(k, v) for k, v in meta.param_defaults.items()}
2032
+
2033
+
2034
+ def action_description(step: TransformationStep) -> str:
2035
+ t, col, p = step.type, step.column, step.parameters
2036
+ meta = _STATELESS_REGISTRY.get(t)
2037
+ if meta is not None:
2038
+ return meta.desc_tpl.format(col=col, **_extract_params(step, meta))
2039
+ if t == PipelineTransformationType.YEO_JOHNSON:
2040
+ return f"yeo-johnson transform {col}"
2041
+ if t == PipelineTransformationType.ENCODE:
2042
+ method = p.get("method", "one_hot")
2043
+ if method in ("one_hot", "onehot"):
2044
+ return f"one-hot encode {col}"
2045
+ return f"label-encode {col}"
2046
+ if t == PipelineTransformationType.SCALE:
2047
+ method = p.get("method", "standard")
2048
+ if method == "minmax":
2049
+ return f"min-max scale {col}"
2050
+ return f"standard-scale {col}"
2051
+ if t == PipelineTransformationType.DERIVED_COLUMN:
2052
+ action = p.get("action", "ratio")
2053
+ if action == "ratio":
2054
+ return f"create {col} = {p.get('numerator', '?')} / {p.get('denominator', '?')}"
2055
+ if action == "interaction":
2056
+ features = p.get("features", [])
2057
+ col_a = features[0] if len(features) > 0 else p.get("col_a", "?")
2058
+ col_b = features[1] if len(features) > 1 else p.get("col_b", "?")
2059
+ return f"create {col} = {col_a} * {col_b}"
2060
+ if action == "composite":
2061
+ return f"create {col} = mean({', '.join(p.get('columns', []))})"
2062
+ return f"transform {col}"
2063
+
2064
+
2065
+ def render_step_call(step: TransformationStep, fit_mode: bool = True) -> str:
2066
+ t, col, p = step.type, step.column, step.parameters
2067
+ meta = _STATELESS_REGISTRY.get(t)
2068
+ if meta is not None:
2069
+ return meta.call_tpl.format(col=col, **_extract_params(step, meta))
2070
+ if t == PipelineTransformationType.YEO_JOHNSON:
2071
+ method = "fit_transform" if fit_mode else "transform"
2072
+ return f"FittedPowerTransform().{method}(df, '{col}', _store)"
2073
+ if t == PipelineTransformationType.ENCODE:
2074
+ method = p.get("method", "one_hot")
2075
+ if method in ("one_hot", "onehot"):
2076
+ return f"apply_one_hot_encode(df, '{col}')"
2077
+ fit_method = "fit_transform" if fit_mode else "transform"
2078
+ return f"FittedEncoder().{fit_method}(df, '{col}', _store)"
2079
+ if t == PipelineTransformationType.SCALE:
2080
+ method = p.get("method", "standard")
2081
+ fit_method = "fit_transform" if fit_mode else "transform"
2082
+ return f"FittedScaler('{method}').{fit_method}(df, '{col}', _store)"
2083
+ if t == PipelineTransformationType.DERIVED_COLUMN:
2084
+ action = p.get("action", "ratio")
2085
+ if action == "ratio":
2086
+ return f"apply_derived_ratio(df, '{col}', numerator='{p.get('numerator', '')}', denominator='{p.get('denominator', '')}')"
2087
+ if action == "interaction":
2088
+ features = p.get("features", [])
2089
+ col_a = features[0] if len(features) > 0 else p.get("col_a", "")
2090
+ col_b = features[1] if len(features) > 1 else p.get("col_b", "")
2091
+ return f"apply_derived_interaction(df, '{col}', col_a='{col_a}', col_b='{col_b}')"
2092
+ if action == "composite":
2093
+ return f"apply_derived_composite(df, '{col}', columns={p.get('columns', [])})"
2094
+ raise ValueError(f"Unknown transformation type: {step.type}")
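+ # For example, a CAP_OUTLIER step on a hypothetical column "spend" with lower=0 and upper=500
+ # renders as "apply_cap_outlier(df, 'spend', lower=0, upper=500)".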
2095
+
2096
+
2097
+ def collect_imports(steps, include_fitted):
2098
+ ops = set()
2099
+ fitted = set()
2100
+ _OPS_MAP = {k: v.import_name for k, v in _STATELESS_REGISTRY.items()}
2101
+ for step in steps:
2102
+ t, p = step.type, step.parameters
2103
+ if t in _OPS_MAP:
2104
+ ops.add(_OPS_MAP[t])
2105
+ elif t == PipelineTransformationType.ENCODE:
2106
+ method = p.get("method", "one_hot")
2107
+ if method in ("one_hot", "onehot"):
2108
+ ops.add("apply_one_hot_encode")
2109
+ elif include_fitted:
2110
+ fitted.add("FittedEncoder")
2111
+ elif t == PipelineTransformationType.SCALE:
2112
+ if include_fitted:
2113
+ fitted.add("FittedScaler")
2114
+ elif t == PipelineTransformationType.YEO_JOHNSON:
2115
+ if include_fitted:
2116
+ fitted.add("FittedPowerTransform")
2117
+ elif t == PipelineTransformationType.DERIVED_COLUMN:
2118
+ action = p.get("action", "ratio")
2119
+ if action == "ratio":
2120
+ ops.add("apply_derived_ratio")
2121
+ elif action == "interaction":
2122
+ ops.add("apply_derived_interaction")
2123
+ elif action == "composite":
2124
+ ops.add("apply_derived_composite")
2125
+ return ops, fitted