churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,685 @@
1
+ import re
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ from customer_retention.analysis.auto_explorer.findings import ExplorationFindings
7
+ from customer_retention.core.config.column_config import ColumnType
8
+
9
+
10
@dataclass
class CleanAction:
    """A single cleaning step parsed from a recommendation string.

    Instances are built by ``RecommendationParser.parse_cleaning`` and later
    rendered into source code by the pipeline generator.
    """

    # Kind of step, e.g. "impute", "cap_outliers", "drop_rare".
    action_type: str
    # Variant of the step, e.g. "median" or "constant"; empty when the step has none.
    strategy: str = ""
    # Extra arguments such as "fill_value", "percentile" or "threshold_percent".
    # A factory is used so every instance owns its own dict.
    params: Dict[str, Any] = field(default_factory=dict)
15
+
16
+
17
@dataclass
class TransformAction:
    """A single transformation step parsed from a recommendation string.

    Instances are built by ``RecommendationParser.parse_transform``.
    """

    # Family of the step, e.g. "scale", "transform", "encode", "datetime_extract".
    action_type: str
    # Concrete technique within the family, e.g. "standard", "log1p", "onehot".
    method: str = ""
    # Extra arguments for the step; a factory gives each instance its own dict.
    params: Dict[str, Any] = field(default_factory=dict)
22
+
23
+
24
class RecommendationParser:
    """Turn recommendation identifiers into structured actions.

    Both tables map a regular expression to an
    ``(action_type, strategy_or_method, default_params)`` template. Patterns
    are tried in declaration order with ``re.match``, which anchors only at
    the start of the string, and the first pattern that matches wins.
    """

    CLEANING_PATTERNS = {
        r"impute_median": ("impute", "median", {}),
        r"impute_mean": ("impute", "mean", {}),
        r"impute_mode": ("impute", "mode", {}),
        r"impute_zero": ("impute", "constant", {"fill_value": 0}),
        r"impute_constant_(.+)": ("impute", "constant", {}),
        r"cap_outliers_(\d+)": ("cap_outliers", "", {}),
        r"remove_outliers_iqr": ("remove_outliers", "iqr", {}),
        r"drop_rare_(\d+)": ("drop_rare", "", {}),
        r"drop_nulls": ("drop_nulls", "", {}),
    }

    TRANSFORM_PATTERNS = {
        r"standard_scale": ("scale", "standard", {}),
        r"minmax_scale": ("scale", "minmax", {}),
        r"robust_scale": ("scale", "robust", {}),
        r"log_transform": ("transform", "log1p", {}),
        r"sqrt_transform": ("transform", "sqrt", {}),
        r"power_transform": ("transform", "yeo_johnson", {}),
        r"onehot_encode": ("encode", "onehot", {}),
        r"label_encode": ("encode", "label", {}),
        r"ordinal_encode": ("encode", "ordinal", {}),
        r"extract_month": ("datetime_extract", "month", {}),
        r"extract_dayofweek": ("datetime_extract", "dayofweek", {}),
        r"extract_day$": ("datetime_extract", "day", {}),
        r"extract_hour": ("datetime_extract", "hour", {}),
        r"extract_year": ("datetime_extract", "year", {}),
        r"days_since": ("datetime_extract", "days_since", {}),
    }

    def parse_cleaning(self, recommendation: str) -> Optional[CleanAction]:
        """Return the ``CleanAction`` for *recommendation*, or ``None`` if no pattern matches.

        When the matching pattern has a capture group, its text becomes
        ``percentile`` (int) for ``cap_outliers``, ``threshold_percent`` (int)
        for ``drop_rare``, or ``fill_value`` (kept as a string) for a constant
        imputation that does not already define one.
        """
        for regex, template in self.CLEANING_PATTERNS.items():
            hit = re.match(regex, recommendation)
            if hit is None:
                continue

            kind, how, defaults = template
            # Copy so the shared class-level template dict is never mutated.
            extras = dict(defaults)
            captured = hit.groups()
            if captured:
                value = captured[0]
                if kind == "cap_outliers":
                    extras["percentile"] = int(value)
                elif kind == "drop_rare":
                    extras["threshold_percent"] = int(value)
                elif how == "constant" and "fill_value" not in extras:
                    extras["fill_value"] = value
            return CleanAction(action_type=kind, strategy=how, params=extras)
        return None

    def parse_transform(self, recommendation: str) -> Optional[TransformAction]:
        """Return the ``TransformAction`` for *recommendation*, or ``None`` if no pattern matches."""
        for regex, (kind, technique, defaults) in self.TRANSFORM_PATTERNS.items():
            if re.match(regex, recommendation) is not None:
                return TransformAction(action_type=kind, method=technique, params=dict(defaults))
        return None
75
+
76
+
77
@dataclass
class MLflowConfig:
    """Settings that control how the generated pipeline talks to MLflow."""

    # Emitted as the MLFLOW_TRACKING_URI constant of the generated script.
    tracking_uri: str = "./mlruns"
    # Emitted as the EXPERIMENT_NAME constant of the generated script.
    experiment_name: str = "ml_pipeline"
    # The remaining options except log_data_quality are not read in the
    # visible part of this file; presumably consumed by later sections.
    run_name: Optional[str] = None
    # When true, the data-quality logging helper is added to the generated script.
    log_data_quality: bool = True
    log_transformations: bool = True
    log_feature_importance: bool = True
    nested_runs: bool = True
    model_name: Optional[str] = None
87
+
88
+
89
+ class MLflowPipelineGenerator:
90
def __init__(
    self,
    mlflow_config: Optional[MLflowConfig] = None,
    output_dir: str = "./generated_pipelines",
):
    """Create a generator.

    Args:
        mlflow_config: MLflow settings; a default ``MLflowConfig`` is used
            when omitted (or falsy).
        output_dir: Directory path stored on the instance as ``output_dir``.
    """
    if not mlflow_config:
        mlflow_config = MLflowConfig()
    self.mlflow_config = mlflow_config
    self.output_dir = output_dir
    # Parser used to turn recommendation strings into structured actions.
    self._parser = RecommendationParser()
98
+
99
def generate_pipeline(self, findings: ExplorationFindings) -> str:
    """Build the complete pipeline script as one string.

    Sections are rendered in a fixed order and joined with a blank line.
    The data-quality logging helper is included only when
    ``mlflow_config.log_data_quality`` is true.
    """
    preamble = [
        self._generate_docstring(findings),
        self._generate_imports(),
        self._generate_mlflow_setup(),
    ]

    quality = (
        [self._generate_data_quality_logging()]
        if self.mlflow_config.log_data_quality
        else []
    )

    stages = [
        self.generate_cleaning_functions(findings),
        self.generate_transform_functions(findings),
        self.generate_feature_engineering(findings),
        self.generate_model_training(findings),
        self.generate_monitoring(findings),
        self._generate_main(findings),
    ]
    return "\n\n".join(preamble + quality + stages)
118
+
119
def _generate_docstring(self, findings: ExplorationFindings) -> str:
    """Render the module docstring that heads the generated script.

    The header records the source path, the target column (or
    ``Not specified`` when it is falsy), the row count with thousands
    separators, and the column count.
    """
    quotes = '"""'
    header = [
        quotes,
        "MLflow-tracked ML Pipeline",
        "Generated from exploration findings",
        "",
        f"Source: {findings.source_path}",
        f"Target: {findings.target_column or 'Not specified'}",
        f"Rows: {findings.row_count:,}",
        f"Features: {findings.column_count}",
        quotes,
    ]
    return "\n".join(header)
129
+
130
def _generate_imports(self) -> str:
    """Return the fixed import section of the generated script."""
    import_lines = (
        "import pandas as pd",
        "import numpy as np",
        "from datetime import datetime",
        "from typing import Dict, List, Tuple, Any",
        "",
        "import mlflow",
        "import mlflow.sklearn",
        "from sklearn.model_selection import train_test_split, cross_val_score",
        "from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder",
        "from sklearn.impute import SimpleImputer",
        "from sklearn.compose import ColumnTransformer",
        "from sklearn.pipeline import Pipeline",
        "from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier",
        "from sklearn.linear_model import LogisticRegression",
        "from sklearn.metrics import (",
        "    accuracy_score, precision_score, recall_score, f1_score,",
        "    roc_auc_score, classification_report, confusion_matrix",
        ")",
    )
    return "\n".join(import_lines)
149
+
150
def _generate_mlflow_setup(self) -> str:
    """Render the MLflow constants and the ``setup_mlflow`` helper.

    The tracking URI and experiment name from ``self.mlflow_config`` are
    emitted as quoted string literals, so values containing backslashes or
    double quotes (for example Windows paths) still produce a valid script.
    Plain values render exactly as ``"value"``.

    Returns:
        Source text starting with a newline, defining
        ``MLFLOW_TRACKING_URI``, ``EXPERIMENT_NAME`` and ``setup_mlflow()``.
    """
    import json  # local import: only needed to quote the two literals below

    config = self.mlflow_config
    # With ensure_ascii=False, json.dumps escapes only quotes, backslashes and
    # control characters; each of those escapes is also a valid Python string
    # escape, so the result can be pasted in as a Python literal.
    tracking_uri = json.dumps(str(config.tracking_uri), ensure_ascii=False)
    experiment_name = json.dumps(str(config.experiment_name), ensure_ascii=False)

    # The function body must be indented in the emitted text for the
    # generated script to be valid Python.
    return f'''
MLFLOW_TRACKING_URI = {tracking_uri}
EXPERIMENT_NAME = {experiment_name}


def setup_mlflow():
    """Initialize MLflow tracking."""
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment(EXPERIMENT_NAME)
    return mlflow.get_experiment_by_name(EXPERIMENT_NAME)'''
161
+
162
def _generate_data_quality_logging(self) -> str:
    """Render the ``log_data_quality_metrics`` helper for the generated script.

    The emitted function collects row count, column count, memory usage in
    MB and a null percentage per column into a dict, passes it to
    ``mlflow.log_metrics`` and returns it.

    Returns:
        Source text starting with a newline. The body is indented with four
        spaces per level so that the emitted code is valid Python.
    """
    # Not an f-string: the braces below belong to the f-strings of the
    # generated code and must be emitted verbatim.
    return '''
def log_data_quality_metrics(df: pd.DataFrame, prefix: str = "data"):
    """Log data quality metrics to MLflow."""
    metrics = {
        f"{prefix}_rows": len(df),
        f"{prefix}_columns": len(df.columns),
        f"{prefix}_memory_mb": df.memory_usage(deep=True).sum() / 1024 / 1024,
    }

    for col in df.columns:
        null_pct = df[col].isna().mean() * 100
        metrics[f"{prefix}_null_pct_{col}"] = null_pct

    mlflow.log_metrics(metrics)
    return metrics'''
178
+
179
def generate_cleaning_functions(self, findings: ExplorationFindings) -> str:
    """Render the source of ``clean_data(df)`` for the generated script.

    The emitted function copies the frame, applies the per-column snippets
    produced by ``_action_to_cleaning_code`` for every step returned by
    ``_build_cleaning_steps``, logs the collected ``cleaning_stats`` with
    ``mlflow.log_params`` and returns the cleaned frame. When there are no
    steps a placeholder comment is emitted instead.

    Args:
        findings: Exploration findings forwarded to ``_build_cleaning_steps``.

    Returns:
        The function source, with its body indented by four spaces so the
        spliced-in snippets and the surrounding lines form valid Python.
    """
    cleaning_steps = self._build_cleaning_steps(findings)

    code_lines = [
        "def clean_data(df: pd.DataFrame) -> pd.DataFrame:",
        '    """Apply cleaning transformations based on exploration findings."""',
        "    df = df.copy()",
        "    cleaning_stats = {}",
        "",
    ]

    if not cleaning_steps:
        code_lines.append("    # No cleaning recommendations found")
    else:
        for col_name, actions in cleaning_steps.items():
            for action in actions:
                code_lines.extend(self._action_to_cleaning_code(col_name, action))

    code_lines.extend([
        "",
        "    mlflow.log_params({f'cleaned_{k}': v for k, v in cleaning_stats.items()})",
        "    return df",
    ])

    return "\n".join(code_lines)
204
+
205
def _build_cleaning_steps(self, findings: ExplorationFindings) -> Dict[str, List[CleanAction]]:
    """Collect the parsed cleaning actions for each column.

    Columns whose inferred type is ``IDENTIFIER`` or ``TARGET`` are skipped.
    Recommendations the parser does not recognise are dropped, and columns
    left without any action are omitted from the result.

    Returns:
        Mapping of column name to its list of actions, in the iteration
        order of ``findings.columns``.
    """
    plan = {}
    for name, finding in findings.columns.items():
        if finding.inferred_type in (ColumnType.IDENTIFIER, ColumnType.TARGET):
            continue

        parsed = [
            action
            for action in map(self._parser.parse_cleaning, finding.cleaning_recommendations)
            if action
        ]
        if parsed:
            plan[name] = parsed
    return plan
221
+
222
def _action_to_cleaning_code(self, col_name: str, action: CleanAction) -> List[str]:
    """Render one cleaning action as source lines for the body of ``clean_data``.

    Every action produced by ``RecommendationParser.CLEANING_PATTERNS`` has a
    code path here: imputation (median, mean, mode, constant), outlier
    capping, IQR outlier removal, rare-category replacement and null-row
    dropping. Each snippet records what it changed in ``cleaning_stats``.

    Args:
        col_name: Column the action applies to. It is emitted through
            ``repr`` so names containing quotes or backslashes stay valid.
        action: Parsed action; ``action_type``, ``strategy`` and ``params``
            are read.

    Returns:
        Lines indented for a function body (four spaces, eight inside an
        ``if``), ending with an empty separator line. An empty list is
        returned for an unrecognised action.
    """
    column = f"df[{col_name!r}]"

    def stat(suffix: str) -> str:
        # Slot in cleaning_stats where the generated code stores its count.
        return f"cleaning_stats[{col_name + suffix!r}]"

    kind, strategy, params = action.action_type, action.strategy, action.params

    if kind == "impute":
        if strategy in ("median", "mean"):
            # Both map to the pandas Series method of the same name.
            return [
                f"    # Impute {col_name} with {strategy}",
                f"    if {column}.isna().any():",
                f"        {strategy}_val = {column}.{strategy}()",
                f"        {stat('_imputed')} = {column}.isna().sum()",
                f"        {column} = {column}.fillna({strategy}_val)",
                "",
            ]
        if strategy == "mode":
            return [
                f"    # Impute {col_name} with mode",
                f"    if {column}.isna().any():",
                f"        mode_val = {column}.mode().iloc[0] if not {column}.mode().empty else None",
                "        if mode_val is not None:",
                f"            {stat('_imputed')} = {column}.isna().sum()",
                f"            {column} = {column}.fillna(mode_val)",
                "",
            ]
        if strategy == "constant":
            fill_value = params.get("fill_value", 0)
            return [
                f"    # Impute {col_name} with constant",
                f"    if {column}.isna().any():",
                f"        {stat('_imputed')} = {column}.isna().sum()",
                f"        {column} = {column}.fillna({fill_value!r})",
                "",
            ]
        return []

    if kind == "cap_outliers":
        percentile = params.get("percentile", 99)
        # Sort so a percentile below 50 cannot produce lower > upper.
        lower_q, upper_q = sorted(((100 - percentile) / 100, percentile / 100))
        return [
            f"    # Cap outliers in {col_name} at {percentile}th percentile",
            f"    lower = {column}.quantile({lower_q})",
            f"    upper = {column}.quantile({upper_q})",
            f"    outliers = (({column} < lower) | ({column} > upper)).sum()",
            f"    {stat('_outliers_capped')} = outliers",
            f"    {column} = {column}.clip(lower, upper)",
            "",
        ]

    if kind == "remove_outliers" and strategy == "iqr":
        # Conventional 1.5 * IQR fences; null rows are kept because they are
        # not outliers and have their own recommendations.
        return [
            f"    # Remove IQR outliers in {col_name}",
            f"    q1 = {column}.quantile(0.25)",
            f"    q3 = {column}.quantile(0.75)",
            "    iqr = q3 - q1",
            f"    keep = {column}.between(q1 - 1.5 * iqr, q3 + 1.5 * iqr) | {column}.isna()",
            f"    {stat('_outliers_removed')} = (~keep).sum()",
            "    df = df.loc[keep].copy()",
            "",
        ]

    if kind == "drop_rare":
        threshold = params.get("threshold_percent", 5)
        # Rare categories are replaced by the most frequent value, not removed.
        return [
            f"    # Drop rare categories in {col_name} (< {threshold}%)",
            f"    value_counts = {column}.value_counts(normalize=True)",
            f"    rare_values = value_counts[value_counts < {threshold / 100}].index",
            "    if len(rare_values) > 0:",
            f"        {stat('_rare_dropped')} = len(rare_values)",
            f"        df.loc[{column}.isin(rare_values), {col_name!r}] = {column}.mode().iloc[0]",
            "",
        ]

    if kind == "drop_nulls":
        return [
            f"    # Drop rows where {col_name} is null",
            f"    null_rows = {column}.isna()",
            f"    {stat('_nulls_dropped')} = null_rows.sum()",
            "    df = df.loc[~null_rows].copy()",
            "",
        ]

    return []
280
+
281
def generate_transform_functions(self, findings: ExplorationFindings) -> str:
    """Return the source of a generated ``apply_transforms(df)`` function.

    The generated function copies ``df``, applies the transformations that
    ``self._build_transform_actions(findings)`` reports per column (log1p,
    standard / min-max scaling, one-hot and label encoding, in that order),
    logs a summary through ``mlflow.log_params`` and returns
    ``(df, transformers)``.

    Column names are emitted via ``repr()`` so unusual names stay valid
    Python, and the label-encoder registry is initialised with a real empty
    dict (``{}``).
    """
    transform_actions = self._build_transform_actions(findings)

    def columns_where(predicate):
        # Columns having at least one action that satisfies the predicate.
        return [
            col for col, actions in transform_actions.items()
            if any(predicate(a) for a in actions)
        ]

    code_lines = [
        "def apply_transforms(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Any]]:",
        '    """Apply transformations based on exploration recommendations."""',
        "    df = df.copy()",
        "    transformers = {}",
        "",
    ]

    # Log transform for skewed columns
    for col in columns_where(lambda a: a.method == "log1p"):
        source = repr(col)
        log_name = repr(f"{col}_log")
        flag_name = repr(f"{col}_log_transform")
        code_lines.extend([
            f"    # Log transform {col} (recommended for skewness)",
            f"    df[{log_name}] = np.log1p(df[{source}].clip(lower=0))",
            f"    transformers[{flag_name}] = True",
            "",
        ])

    # Standard scaling
    scale_standard = columns_where(
        lambda a: a.action_type == "scale" and a.method == "standard"
    )
    if scale_standard:
        code_lines.extend([
            "    # Standard scaling",
            f"    standard_cols = {scale_standard}",
            "    if standard_cols:",
            "        scaler = StandardScaler()",
            "        df[standard_cols] = scaler.fit_transform(df[standard_cols])",
            "        transformers['standard_scaler'] = {'columns': standard_cols}",
            "",
        ])

    # MinMax scaling
    scale_minmax = columns_where(
        lambda a: a.action_type == "scale" and a.method == "minmax"
    )
    if scale_minmax:
        code_lines.extend([
            "    # MinMax scaling",
            f"    minmax_cols = {scale_minmax}",
            "    if minmax_cols:",
            "        minmax_scaler = MinMaxScaler()",
            "        df[minmax_cols] = minmax_scaler.fit_transform(df[minmax_cols])",
            "        transformers['minmax_scaler'] = {'columns': minmax_cols}",
            "",
        ])

    # One-hot encoding
    onehot_cols = columns_where(
        lambda a: a.action_type == "encode" and a.method == "onehot"
    )
    if onehot_cols:
        code_lines.extend([
            "    # One-hot encoding",
            f"    onehot_cols = {onehot_cols}",
            "    for col in onehot_cols:",
            "        dummies = pd.get_dummies(df[col], prefix=col, drop_first=True)",
            "        df = pd.concat([df.drop(columns=[col]), dummies], axis=1)",
            "        transformers[f'{col}_onehot'] = list(dummies.columns)",
            "",
        ])

    # Label encoding
    label_cols = columns_where(
        lambda a: a.action_type == "encode" and a.method == "label"
    )
    if label_cols:
        code_lines.extend([
            "    # Label encoding",
            f"    label_cols = {label_cols}",
            # Plain string, so braces must NOT be doubled: "{{}}" would emit a
            # set containing a dict and fail with "unhashable type" at runtime.
            "    label_encoders = {}",
            "    for col in label_cols:",
            "        le = LabelEncoder()",
            "        df[col] = le.fit_transform(df[col].astype(str))",
            "        label_encoders[col] = le",
            "    transformers['label_encoders'] = label_encoders",
            "",
        ])

    code_lines.extend([
        "    mlflow.log_params({f'transform_{k}': str(v)[:250] for k, v in transformers.items()})",
        "    return df, transformers",
    ])

    return "\n".join(code_lines)
373
+
374
def _build_transform_actions(self, findings: ExplorationFindings) -> Dict[str, List[TransformAction]]:
    """Collect the parsed transformation actions for every feature column.

    Columns whose inferred type is IDENTIFIER or TARGET are skipped.  Each
    remaining column's transformation recommendations are passed through
    ``self._parser.parse_transform``; falsy parse results are discarded and a
    column only appears in the result when at least one action survives.
    """
    skipped_types = (ColumnType.IDENTIFIER, ColumnType.TARGET)
    actions = {}
    for col_name, col_finding in findings.columns.items():
        if col_finding.inferred_type in skipped_types:
            continue
        parsed = [
            self._parser.parse_transform(rec)
            for rec in col_finding.transformation_recommendations
        ]
        kept = [action for action in parsed if action]
        if kept:
            actions[col_name] = kept
    return actions
390
+
391
def generate_feature_engineering(self, findings: ExplorationFindings) -> str:
    """Return the source of a generated ``engineer_features(df)`` function.

    For every DATETIME column the generated code converts the column with
    ``pd.to_datetime(..., errors='coerce')`` and derives the parts requested
    by that column's ``datetime_extract`` actions.  When a column has no such
    action, ``month``, ``dayofweek`` and ``days_since`` are derived.
    Unrecognised extraction names are ignored.  The generated function logs
    the new feature names through ``mlflow.log_param`` and returns ``df``.

    Column and feature names are emitted via ``repr()`` so names containing
    quotes or backslashes still produce valid Python, and all derived
    statements sit inside the ``if <col> in df.columns`` guard.
    """
    datetime_cols = self._get_columns_by_type(findings, [ColumnType.DATETIME])
    transform_actions = self._build_transform_actions(findings)
    # Parts that map one-to-one onto a pandas ``.dt`` accessor attribute.
    accessor_parts = ("month", "day", "dayofweek", "hour", "year")

    code_lines = [
        "def engineer_features(df: pd.DataFrame) -> pd.DataFrame:",
        '    """Engineer features based on exploration findings."""',
        "    df = df.copy()",
        "    new_features = []",
        "",
    ]

    # Datetime feature extraction
    for col_name in datetime_cols:
        col = repr(col_name)
        requested = [
            a.method for a in transform_actions.get(col_name, [])
            if a.action_type == "datetime_extract"
        ]
        extract_types = requested or ["month", "dayofweek", "days_since"]

        code_lines.extend([
            f"    # Datetime features from {col_name}",
            f"    if {col} in df.columns:",
            f"        df[{col}] = pd.to_datetime(df[{col}], errors='coerce')",
            "",
        ])

        for ext_type in extract_types:
            feature = repr(f"{col_name}_{ext_type}")
            if ext_type in accessor_parts:
                code_lines.append(f"        df[{feature}] = df[{col}].dt.{ext_type}")
                code_lines.append(f"        new_features.append({feature})")
            elif ext_type == "days_since":
                # Days are measured back from the latest timestamp in the column.
                code_lines.extend([
                    f"        reference_date = df[{col}].max()",
                    f"        df[{feature}] = (reference_date - df[{col}]).dt.days",
                    f"        new_features.append({feature})",
                ])

        code_lines.append("")

    code_lines.extend([
        "    if new_features:",
        "        mlflow.log_param('engineered_features', new_features)",
        "    return df",
    ])

    return "\n".join(code_lines)
450
+
451
def generate_model_training(self, findings: ExplorationFindings) -> str:
    """Return the source of a generated ``train_model(df, ...)`` function.

    The generated function drops identifier, datetime and target columns
    from the feature set, factorizes object/category columns, fills missing
    values with 0, makes a stratified train/validation/test split, trains
    three scikit-learn classifiers in nested MLflow runs, logs validation,
    test and 5-fold CV metrics, and returns the per-model results together
    with the name of the model with the best validation ROC AUC.

    ``findings.target_column`` falls back to ``"target"`` when unset; the
    identifier and datetime column lists fall back to empty.

    The excluded-column set is rendered in sorted order so the generated
    source is identical across runs (a raw ``set`` repr varies with string
    hash randomization), and the target default is rendered with ``repr()``
    so unusual column names stay valid Python.
    """
    target = findings.target_column or "target"
    identifier_cols = findings.identifier_columns or []
    datetime_cols = findings.datetime_columns or []
    excluded = sorted({*identifier_cols, *datetime_cols, target}, key=str)
    # Never empty (always contains the target), so a set literal is safe.
    exclude_literal = "{" + ", ".join(repr(col) for col in excluded) + "}"

    return f'''
def train_model(
    df: pd.DataFrame,
    target_column: str = {target!r},
    test_size: float = 0.2,
    val_size: float = 0.1,
) -> Dict[str, Any]:
    """Train model with comprehensive MLflow tracking."""

    # Exclude non-feature columns
    exclude_cols = {exclude_literal}
    feature_cols = [col for col in df.columns if col not in exclude_cols and col != target_column]

    # Handle non-numeric columns
    X = df[feature_cols].copy()
    for col in X.select_dtypes(include=['object', 'category']).columns:
        X[col] = pd.factorize(X[col])[0]
    X = X.fillna(0)

    y = df[target_column]

    # Split: train/validation/test
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=val_size/(1-test_size), random_state=42, stratify=y_temp
    )

    mlflow.log_params({{
        "train_samples": len(X_train),
        "validation_samples": len(X_val),
        "test_samples": len(X_test),
        "feature_count": len(feature_cols),
        "test_size": test_size,
        "val_size": val_size,
    }})

    # Train models
    models = {{
        "logistic_regression": LogisticRegression(max_iter=1000, random_state=42),
        "random_forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "gradient_boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    }}

    results = {{}}
    best_model = None
    best_auc = 0

    for name, model in models.items():
        with mlflow.start_run(run_name=name, nested=True):
            # Train
            model.fit(X_train, y_train)

            # Validation predictions
            y_val_pred = model.predict(X_val)
            y_val_proba = model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else y_val_pred

            # Test predictions
            y_test_pred = model.predict(X_test)
            y_test_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_test_pred

            # Calculate metrics
            val_metrics = {{
                "val_accuracy": accuracy_score(y_val, y_val_pred),
                "val_precision": precision_score(y_val, y_val_pred, average="weighted", zero_division=0),
                "val_recall": recall_score(y_val, y_val_pred, average="weighted", zero_division=0),
                "val_f1": f1_score(y_val, y_val_pred, average="weighted", zero_division=0),
                "val_roc_auc": roc_auc_score(y_val, y_val_proba) if len(np.unique(y_val)) > 1 else 0,
            }}

            test_metrics = {{
                "test_accuracy": accuracy_score(y_test, y_test_pred),
                "test_precision": precision_score(y_test, y_test_pred, average="weighted", zero_division=0),
                "test_recall": recall_score(y_test, y_test_pred, average="weighted", zero_division=0),
                "test_f1": f1_score(y_test, y_test_pred, average="weighted", zero_division=0),
                "test_roc_auc": roc_auc_score(y_test, y_test_proba) if len(np.unique(y_test)) > 1 else 0,
            }}

            # Cross-validation
            cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring="roc_auc")
            cv_metrics = {{
                "cv_roc_auc_mean": cv_scores.mean(),
                "cv_roc_auc_std": cv_scores.std(),
            }}

            # Log everything
            mlflow.log_params(model.get_params())
            mlflow.log_metrics({{**val_metrics, **test_metrics, **cv_metrics}})
            mlflow.sklearn.log_model(model, f"model_{{name}}")

            results[name] = {{
                "model": model,
                "val_metrics": val_metrics,
                "test_metrics": test_metrics,
                "cv_metrics": cv_metrics,
            }}

            # The first model always becomes the baseline, so best_model is
            # never left as None even when every validation AUC is 0.
            if best_model is None or val_metrics["val_roc_auc"] > best_auc:
                best_auc = val_metrics["val_roc_auc"]
                best_model = name

    mlflow.log_param("best_model", best_model)
    mlflow.log_metric("best_val_roc_auc", best_auc)

    return {{"results": results, "best_model": best_model}}'''
563
+
564
def generate_monitoring(self, findings: ExplorationFindings) -> str:
    """Return the source of a generated ``evaluate_model`` function.

    The emitted function predicts on ``X_test``, computes accuracy, weighted
    precision / recall / F1 and ROC AUC (0 when ``y_test`` has a single
    class), logs them to MLflow under a ``monitor_`` prefix and returns the
    metrics dict.  ``findings`` is accepted for interface symmetry with the
    other generators but is not consulted.

    The template is a plain (non-f) string, so its braces are literal, and
    it is indented as ordinary Python so the emitted function parses.
    """
    return '''
def evaluate_model(model, X_test: pd.DataFrame, y_test: pd.Series) -> Dict[str, float]:
    """Evaluate model and log monitoring metrics."""
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average="weighted", zero_division=0),
        "recall": recall_score(y_test, y_pred, average="weighted", zero_division=0),
        "f1": f1_score(y_test, y_pred, average="weighted", zero_division=0),
        "roc_auc": roc_auc_score(y_test, y_proba) if len(np.unique(y_test)) > 1 else 0,
    }

    mlflow.log_metrics({f"monitor_{k}": v for k, v in metrics.items()})

    return metrics'''
582
+
583
+ def _generate_main(self, findings: ExplorationFindings) -> str:
584
+ source_path = findings.source_path
585
+ if findings.source_format == "csv":
586
+ load_expr = f'pd.read_csv("{source_path}")'
587
+ else:
588
+ load_expr = (
589
+ f'get_delta(force_local=True).read("{source_path}") '
590
+ f'if Path("{source_path}").is_dir() and (Path("{source_path}") / "_delta_log").is_dir() '
591
+ f'else pd.read_parquet("{source_path}")'
592
+ )
593
+
594
+ main_body = f'''
595
+ def main():
596
+ """Run the complete ML pipeline with MLflow tracking."""
597
+ from pathlib import Path
598
+ from customer_retention.integrations.adapters.factory import get_delta
599
+ setup_mlflow()
600
+
601
+ with mlflow.start_run(run_name="full_pipeline"):
602
+ # Load data
603
+ print("Loading data...")
604
+ df = {load_expr}'''
605
+
606
+ if self.mlflow_config.log_data_quality:
607
+ main_body += "\n log_data_quality_metrics(df, prefix='raw')"
608
+
609
+ main_body += '''
610
+
611
+ # Clean data
612
+ print("Cleaning data...")
613
+ df = clean_data(df)'''
614
+
615
+ if self.mlflow_config.log_data_quality:
616
+ main_body += "\n log_data_quality_metrics(df, prefix='cleaned')"
617
+
618
+ main_body += '''
619
+
620
+ # Apply transformations
621
+ print("Applying transformations...")
622
+ df, transformers = apply_transforms(df)
623
+
624
+ # Engineer features
625
+ print("Engineering features...")
626
+ df = engineer_features(df)'''
627
+
628
+ if self.mlflow_config.log_data_quality:
629
+ main_body += "\n log_data_quality_metrics(df, prefix='final')"
630
+
631
+ main_body += '''
632
+
633
+ # Train models
634
+ print("Training models...")
635
+ results = train_model(df)
636
+
637
+ print(f"\\nBest model: {results['best_model']}")
638
+ print("Pipeline complete! Check MLflow UI for results.")
639
+
640
+ return results
641
+
642
+
643
+ if __name__ == "__main__":
644
+ main()'''
645
+
646
+ return main_body
647
+
648
+ def _get_columns_by_type(
649
+ self,
650
+ findings: ExplorationFindings,
651
+ col_types: List[ColumnType],
652
+ ) -> List[str]:
653
+ return [
654
+ name for name, col in findings.columns.items()
655
+ if col.inferred_type in col_types
656
+ ]
657
+
658
def generate_all(self, findings: ExplorationFindings) -> Dict[str, str]:
    """Return every generated artifact as a mapping of file name to content.

    The mapping holds ``pipeline.py`` (from ``self.generate_pipeline``) and
    ``requirements.txt`` (from ``self._generate_requirements``), in that order.
    """
    pipeline_source = self.generate_pipeline(findings)
    requirements = self._generate_requirements()
    return {
        "pipeline.py": pipeline_source,
        "requirements.txt": requirements,
    }
663
+
664
+ def _generate_requirements(self) -> str:
665
+ return """pandas>=2.0.0
666
+ numpy>=1.24.0
667
+ scikit-learn>=1.3.0
668
+ mlflow>=2.10.0
669
+ scipy>=1.11.0
670
+ matplotlib>=3.7.0
671
+ seaborn>=0.12.0
672
+ """
673
+
674
def save_all(self, findings: ExplorationFindings) -> List[str]:
    """Write every generated artifact into ``self.output_dir``.

    The directory (including missing parents) is created when needed and
    existing files are overwritten.  Files are written as UTF-8 regardless
    of the platform's locale encoding, so generated source containing
    non-ASCII column names is saved identically everywhere.

    Returns:
        The written file names (not full paths), in generation order.
    """
    files = self.generate_all(findings)
    output_path = Path(self.output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    saved = []
    for filename, content in files.items():
        (output_path / filename).write_text(content, encoding="utf-8")
        saved.append(filename)

    return saved