churnkit-0.75.0a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
customer_retention/generators/pipeline_generator/findings_parser.py
@@ -0,0 +1,727 @@
+from pathlib import Path
+from typing import Dict, List, Optional, Set, Tuple
+
+import yaml
+
+from customer_retention.analysis.auto_explorer.exploration_manager import (
+    DatasetInfo,
+    DatasetRelationshipInfo,
+    MultiDatasetFindings,
+)
+from customer_retention.analysis.auto_explorer.findings import ExplorationFindings
+from customer_retention.analysis.auto_explorer.layered_recommendations import RecommendationRegistry
+
+from .models import (
+    AggregationWindowConfig,
+    BronzeEventConfig,
+    BronzeLayerConfig,
+    GoldLayerConfig,
+    LabelTimestampConfig,
+    LandingLayerConfig,
+    LifecycleConfig,
+    PipelineConfig,
+    PipelineTransformationType,
+    SilverLayerConfig,
+    SourceConfig,
+    TimestampCoalesceConfig,
+    TransformationStep,
+)
+
+
+def _resolve_col_type(col_finding) -> str:
+    col_type = col_finding.inferred_type
+    if hasattr(col_type, 'value'):
+        col_type = col_type.value
+    return col_type
+
+
+class FindingsParser:
+    def __init__(self, findings_dir: str):
+        self._findings_dir = Path(findings_dir)
+        self._source_findings_paths: Dict[str, Path] = {}
+
+    def parse(self) -> PipelineConfig:
+        multi_dataset = self._load_multi_dataset_findings()
+        selected_sources = list(multi_dataset.datasets.keys())
+        source_findings = self._load_source_findings(selected_sources, self._findings_dir, multi_dataset)
+        discovered_events = self._discover_event_sources(source_findings)
+        recommendations_registry = self._load_recommendations()
+        recommendations_hash = recommendations_registry.compute_recommendations_hash() if recommendations_registry else None
+        config = self._build_pipeline_config(multi_dataset, source_findings, recommendations_hash)
+        if recommendations_registry:
+            self._apply_recommendations_to_config(config, recommendations_registry, multi_dataset)
+        self._build_landing_configs(config, multi_dataset, source_findings)
+        self._build_discovered_landing_configs(config, discovered_events, multi_dataset)
+        self._build_bronze_event_configs(config, multi_dataset, source_findings, discovered_events)
+        self._reconcile_discovered_event_transforms(config, discovered_events)
+        return config
+
+    def _load_recommendations(self) -> Optional[RecommendationRegistry]:
+        recommendations_path = None
+        pattern_matches = list(self._findings_dir.glob("*_recommendations.yaml"))
+        if pattern_matches:
+            recommendations_path = max(pattern_matches, key=lambda p: p.stat().st_mtime)
+        elif (self._findings_dir / "recommendations.yaml").exists():
+            recommendations_path = self._findings_dir / "recommendations.yaml"
+        if recommendations_path and recommendations_path.exists():
+            with open(recommendations_path) as f:
+                return RecommendationRegistry.from_dict(yaml.safe_load(f))
+        return None
+
+    def _load_multi_dataset_findings(self) -> MultiDatasetFindings:
+        path = self._findings_dir / "multi_dataset_findings.yaml"
+        if not path.exists():
+            return self._synthesize_from_single_source()
+        with open(path) as f:
+            data = yaml.safe_load(f)
+        return self._dict_to_multi_dataset_findings(data)
+
+    def _synthesize_from_single_source(self) -> MultiDatasetFindings:
+        from customer_retention.core.config.column_config import DatasetGranularity
+
+        candidates = [
+            p for p in self._findings_dir.glob("*_findings.yaml")
+            if p.name != "multi_dataset_findings.yaml"
+        ]
+        if not candidates:
+            raise FileNotFoundError(
+                f"No findings files found in {self._findings_dir}"
+            )
+
+        datasets = {}
+        first_name = None
+        for path in candidates:
+            findings = ExplorationFindings.load(str(path))
+            name = path.stem.replace("_findings", "")
+            if first_name is None:
+                first_name = name
+            datasets[name] = DatasetInfo(
+                name=name,
+                findings_path=str(path),
+                source_path=findings.source_path,
+                granularity=DatasetGranularity.ENTITY_LEVEL,
+                row_count=findings.row_count,
+                column_count=findings.column_count,
+                entity_column=(
+                    findings.identifier_columns[0]
+                    if findings.identifier_columns
+                    else None
+                ),
+                target_column=findings.target_column,
+            )
+
+        return MultiDatasetFindings(
+            datasets=datasets,
+            primary_entity_dataset=first_name,
+        )
+
+    def _dict_to_multi_dataset_findings(self, data: Dict) -> MultiDatasetFindings:
+        from customer_retention.core.config.column_config import DatasetGranularity
+        datasets = {}
+        for name, info in data.get("datasets", {}).items():
+            granularity_str = info.get("granularity", "unknown")
+            granularity = DatasetGranularity(granularity_str) if granularity_str else DatasetGranularity.UNKNOWN
+            datasets[name] = DatasetInfo(
+                name=info["name"],
+                findings_path=info.get("findings_path", ""),
+                source_path=info.get("source_path", ""),
+                granularity=granularity,
+                row_count=info.get("row_count", 0),
+                column_count=info.get("column_count", 0),
+                entity_column=info.get("entity_column"),
+                time_column=info.get("time_column"),
+                target_column=info.get("target_column"),
+                excluded=info.get("excluded", False)
+            )
+        relationships = [
+            DatasetRelationshipInfo(
+                left_dataset=r["left_dataset"],
+                right_dataset=r["right_dataset"],
+                left_column=r["left_column"],
+                right_column=r["right_column"],
+                relationship_type=r.get("relationship_type", "one_to_many"),
+                confidence=r.get("confidence", 1.0),
+                auto_detected=r.get("auto_detected", False)
+            )
+            for r in data.get("relationships", [])
+        ]
+        return MultiDatasetFindings(
+            datasets=datasets,
+            relationships=relationships,
+            primary_entity_dataset=data.get("primary_entity_dataset"),
+            event_datasets=data.get("event_datasets", []),
+            excluded_datasets=data.get("excluded_datasets", []),
+            aggregation_windows=data.get("aggregation_windows", ["24h", "7d", "30d", "90d", "180d", "365d", "all_time"]),
+            notes=data.get("notes", {}),
+        )
+
+    def _load_source_findings(self, sources: List[str], findings_dir: Path, multi_dataset: MultiDatasetFindings = None) -> Dict[str, ExplorationFindings]:
+        result = {}
+        for name in sources:
+            path = None
+            if multi_dataset and name in multi_dataset.datasets:
+                dataset_info = multi_dataset.datasets[name]
+                if dataset_info.findings_path:
+                    raw_path = Path(dataset_info.findings_path)
+                    if raw_path.is_absolute():
+                        path = raw_path
+                    else:
+                        path = (findings_dir / raw_path).resolve()
+                    if not path.exists():
+                        path = findings_dir / raw_path.name
+            if path is None or not path.exists():
+                candidates = list(findings_dir.glob(f"{name}_*_findings.yaml"))
+                if candidates:
+                    path = candidates[0]
+                else:
+                    path = findings_dir / f"{name}_findings.yaml"
+            if path.exists():
+                result[name] = ExplorationFindings.load(str(path))
+                self._source_findings_paths[name] = path.resolve()
+        return result
+
+    def _build_pipeline_config(self, multi: MultiDatasetFindings, sources: Dict[str, ExplorationFindings], recommendations_hash: Optional[str] = None) -> PipelineConfig:
+        source_configs = self._build_source_configs(multi, sources)
+        bronze_configs = self._build_bronze_configs(sources, source_configs)
+        return PipelineConfig(
+            name="",
+            target_column=self._find_target_column(sources),
+            sources=source_configs,
+            bronze=bronze_configs,
+            silver=self._build_silver_config(multi, sources),
+            gold=self._build_gold_config(sources),
+            output_dir="",
+            recommendations_hash=recommendations_hash,
+        )
+
+    def _build_source_configs(self, multi: MultiDatasetFindings, sources: Dict[str, ExplorationFindings]) -> List[SourceConfig]:
+        result = []
+        for name, findings in sources.items():
+            dataset_info = multi.datasets.get(name)
+            is_event = name in multi.event_datasets
+            is_excluded = name in multi.excluded_datasets or (dataset_info and dataset_info.excluded)
+            raw_source = str(Path(dataset_info.source_path if dataset_info else findings.source_path).resolve())
+            time_col = None
+            entity_key = findings.identifier_columns[0] if findings.identifier_columns else "id"
+            if is_event and findings.time_series_metadata:
+                time_col = findings.time_series_metadata.time_column
+                if findings.time_series_metadata.entity_column:
+                    entity_key = findings.time_series_metadata.entity_column
+            result.append(SourceConfig(
+                name=name,
+                path=Path(findings.source_path).name,
+                format=findings.source_format,
+                entity_key=entity_key,
+                raw_source_path=raw_source,
+                time_column=time_col,
+                is_event_level=is_event,
+                excluded=is_excluded
+            ))
+        return result
+
+    def _build_bronze_configs(self, sources: Dict[str, ExplorationFindings], source_configs: List[SourceConfig]) -> Dict[str, BronzeLayerConfig]:
+        result = {}
+        source_map = {s.name: s for s in source_configs}
+        for name, findings in sources.items():
+            source_cfg = source_map[name]
+            if source_cfg.is_event_level:
+                continue
+            result[name] = BronzeLayerConfig(source=source_cfg, transformations=self._extract_transformations(findings))
+        return result
+
+    def _extract_transformations(self, findings: ExplorationFindings) -> List[TransformationStep]:
+        transformations = []
+        for col_name, col_finding in findings.columns.items():
+            if not col_finding.cleaning_needed:
+                continue
+            for rec in col_finding.cleaning_recommendations:
+                step = self._parse_cleaning_recommendation(col_name, rec)
+                if step:
+                    transformations.append(step)
+        return transformations
+
+    def _parse_cleaning_recommendation(self, column: str, recommendation: str) -> TransformationStep:
+        if ":" in recommendation:
+            action, param = recommendation.split(":", 1)
+        else:
+            action, param = recommendation, ""
+        if action == "impute_null":
+            return TransformationStep(
+                type=PipelineTransformationType.IMPUTE_NULL,
+                column=column,
+                parameters={"value": param if param else 0},
+                rationale=f"Impute nulls in {column}"
+            )
+        if action == "cap_outlier":
+            return TransformationStep(
+                type=PipelineTransformationType.CAP_OUTLIER,
+                column=column,
+                parameters={"method": param if param else "iqr"},
+                rationale=f"Cap outliers in {column}"
+            )
+        return None
+
+    def _build_silver_config(self, multi: MultiDatasetFindings, sources: Dict[str, ExplorationFindings]) -> SilverLayerConfig:
+        joins = []
+        for rel in multi.relationships:
+            joins.append({
+                "left_key": rel.left_column,
+                "right_key": rel.right_column,
+                "right_source": rel.right_dataset,
+                "how": "left"
+            })
+        return SilverLayerConfig(joins=joins, aggregations=[])
+
+    def _build_gold_config(self, sources: Dict[str, ExplorationFindings]) -> GoldLayerConfig:
+        encodings = []
+        scalings = []
+        for findings in sources.values():
+            for col_name, col_finding in findings.columns.items():
+                col_type = _resolve_col_type(col_finding)
+                if col_type == "categorical":
+                    encodings.append(TransformationStep(
+                        type=PipelineTransformationType.ENCODE,
+                        column=col_name,
+                        parameters={"method": "one_hot"},
+                        rationale=f"One-hot encode {col_name}"
+                    ))
+                elif col_type == "numeric":
+                    scalings.append(TransformationStep(
+                        type=PipelineTransformationType.SCALE,
+                        column=col_name,
+                        parameters={"method": "standard"},
+                        rationale=f"Standardize {col_name}"
+                    ))
+        return GoldLayerConfig(encodings=encodings, scalings=scalings)
+
+    def _find_target_column(self, sources: Dict[str, ExplorationFindings]) -> str:
+        for findings in sources.values():
+            if findings.target_column:
+                return findings.target_column
+        return "target"
+
+    def _apply_recommendations_to_config(self, config: PipelineConfig, registry: RecommendationRegistry, multi: MultiDatasetFindings) -> None:
+        self._apply_bronze_recommendations(config, registry)
+        self._apply_silver_recommendations(config, registry)
+        self._apply_gold_recommendations(config, registry)
+
+    def _apply_bronze_recommendations(self, config: PipelineConfig, registry: RecommendationRegistry) -> None:
+        sources_to_process = dict(registry.sources)
+        if not sources_to_process and hasattr(registry, 'bronze') and registry.bronze is not None:
+            sources_to_process = {"_default": registry.bronze}
+        for source_name, bronze_recs in sources_to_process.items():
+            target_bronze = self._find_bronze_config_for_source(config, source_name, bronze_recs.source_file)
+            if target_bronze is None:
+                continue
+            for rec in bronze_recs.null_handling:
+                step = self._map_bronze_null(rec)
+                if step:
+                    target_bronze.transformations.append(step)
+            for rec in bronze_recs.outlier_handling:
+                step = self._map_bronze_outlier(rec)
+                if step:
+                    target_bronze.transformations.append(step)
+            target_bronze.transformations = self._deduplicate_steps(target_bronze.transformations)
+
+    @staticmethod
+    def _deduplicate_steps(steps: List[TransformationStep]) -> List[TransformationStep]:
+        seen: Set[Tuple[PipelineTransformationType, str]] = set()
+        result: List[TransformationStep] = []
+        for step in steps:
+            key = (step.type, step.column)
+            if key not in seen:
+                seen.add(key)
+                result.append(step)
+        return result
+
+    def _find_bronze_config_for_source(self, config: PipelineConfig, source_name: str, source_file: str) -> Optional[BronzeLayerConfig]:
+        if source_name in config.bronze:
+            return config.bronze[source_name]
+        source_path = Path(source_file) if source_file else None
+        for name, bronze in config.bronze.items():
+            if source_path and Path(bronze.source.path).name == source_path.name:
+                return bronze
+            if source_path and Path(bronze.source.raw_source_path).name == source_path.name:
+                return bronze
+        if len(config.bronze) == 1:
+            return next(iter(config.bronze.values()))
+        return None
+
+    def _map_bronze_null(self, rec) -> Optional[TransformationStep]:
+        strategy = rec.parameters.get("strategy", "median")
+        if strategy == "drop":
+            return TransformationStep(
+                type=PipelineTransformationType.DROP_COLUMN,
+                column=rec.target_column,
+                parameters={"strategy": "drop"},
+                rationale=rec.rationale,
+                source_notebook=rec.source_notebook,
+            )
+        return TransformationStep(
+            type=PipelineTransformationType.IMPUTE_NULL,
+            column=rec.target_column,
+            parameters={"value": strategy},
+            rationale=rec.rationale,
+            source_notebook=rec.source_notebook,
+        )
+
+    def _map_bronze_outlier(self, rec) -> Optional[TransformationStep]:
+        if rec.action == "segment_aware_cap":
+            return TransformationStep(
+                type=PipelineTransformationType.SEGMENT_AWARE_CAP,
+                column=rec.target_column,
+                parameters={
+                    "method": rec.parameters.get("method", "segment_iqr"),
+                    "n_segments": rec.parameters.get("n_segments", 2),
+                },
+                rationale=rec.rationale,
+                source_notebook=rec.source_notebook,
+            )
+        if rec.action == "winsorize":
+            return TransformationStep(
+                type=PipelineTransformationType.WINSORIZE,
+                column=rec.target_column,
+                parameters={
+                    "lower_bound": rec.parameters.get("lower_bound", 0),
+                    "upper_bound": rec.parameters.get("upper_bound", 1000000),
+                },
+                rationale=rec.rationale,
+                source_notebook=rec.source_notebook,
+            )
+        return TransformationStep(
+            type=PipelineTransformationType.CAP_OUTLIER,
+            column=rec.target_column,
+            parameters={"method": rec.parameters.get("method", "iqr")},
+            rationale=rec.rationale,
+            source_notebook=rec.source_notebook,
+        )
+
+    def _apply_silver_recommendations(self, config: PipelineConfig, registry: RecommendationRegistry) -> None:
+        if not hasattr(registry, 'silver') or registry.silver is None:
+            return
+        for rec in getattr(registry.silver, 'derived_columns', []):
+            step = self._map_silver_derived(rec)
+            if step:
+                config.silver.derived_columns.append(step)
+
+    def _map_silver_derived(self, rec) -> Optional[TransformationStep]:
+        action = rec.action
+        params = dict(rec.parameters)
+        if action in ("ratio", "interaction", "composite"):
+            return TransformationStep(
+                type=PipelineTransformationType.DERIVED_COLUMN,
+                column=rec.target_column,
+                parameters={"action": action, **params},
+                rationale=rec.rationale,
+                source_notebook=rec.source_notebook,
+            )
+        return None
+
+    def _apply_gold_recommendations(self, config: PipelineConfig, registry: RecommendationRegistry) -> None:
+        if not hasattr(registry, 'gold') or registry.gold is None:
+            return
+        gold = registry.gold
+        seen_encoding_columns: Set[str] = {e.column for e in config.gold.encodings}
+        for rec in getattr(gold, 'encoding', []):
+            if rec.target_column in seen_encoding_columns:
+                continue
+            seen_encoding_columns.add(rec.target_column)
+            method = rec.parameters.get("method", rec.action)
+            if method in ("onehot", "one_hot"):
+                method = "one_hot"
+            config.gold.encodings.append(TransformationStep(
+                type=PipelineTransformationType.ENCODE,
+                column=rec.target_column,
+                parameters={"method": method},
+                rationale=rec.rationale,
+                source_notebook=rec.source_notebook,
+            ))
+        seen_scaling_columns: Set[str] = {s.column for s in config.gold.scalings}
+        for rec in getattr(gold, 'scaling', []):
+            if rec.target_column in seen_scaling_columns:
+                continue
+            seen_scaling_columns.add(rec.target_column)
+            config.gold.scalings.append(TransformationStep(
+                type=PipelineTransformationType.SCALE,
+                column=rec.target_column,
+                parameters={"method": rec.parameters.get("method", "standard")},
+                rationale=rec.rationale,
+                source_notebook=rec.source_notebook,
+            ))
+        for rec in getattr(gold, 'transformations', []):
+            step = self._map_gold_transformation(rec)
+            if step:
+                config.gold.transformations.append(step)
+        prioritized_columns = self._collect_prioritized_columns(gold)
+        drop_columns = self._collect_feature_selection_drops(gold, prioritized_columns)
+        config.gold.feature_selections = list(drop_columns)
+
+    def _map_gold_transformation(self, rec) -> Optional[TransformationStep]:
+        action = rec.action
+        type_map = {
+            "log": PipelineTransformationType.LOG_TRANSFORM,
+            "log_transform": PipelineTransformationType.LOG_TRANSFORM,
+            "sqrt": PipelineTransformationType.SQRT_TRANSFORM,
+            "sqrt_transform": PipelineTransformationType.SQRT_TRANSFORM,
+            "yeo_johnson": PipelineTransformationType.YEO_JOHNSON,
+            "zero_inflation_handling": PipelineTransformationType.ZERO_INFLATION_HANDLING,
+            "cap_then_log": PipelineTransformationType.CAP_THEN_LOG,
+        }
+        trans_type = type_map.get(action)
+        if trans_type is None:
+            return None
+        return TransformationStep(
+            type=trans_type,
+            column=rec.target_column,
+            parameters=dict(rec.parameters) if rec.parameters else {},
+            rationale=rec.rationale,
+            source_notebook=rec.source_notebook,
+        )
+
+    def _collect_prioritized_columns(self, gold) -> Set[str]:
+        prioritized = set()
+        for rec in getattr(gold, 'feature_selection', []):
+            if rec.action == "prioritize":
+                prioritized.add(rec.target_column)
+        return prioritized
+
+    def _collect_feature_selection_drops(self, gold, prioritized: Set[str]) -> Set[str]:
+        drops = set()
+        for rec in getattr(gold, 'feature_selection', []):
+            if rec.action in ("drop_multicollinear", "drop_weak"):
+                if rec.target_column not in prioritized:
+                    drops.add(rec.target_column)
+        return drops
+
+    @staticmethod
+    def _resolve_raw_time_column(findings: ExplorationFindings) -> Optional[str]:
+        """Get the raw data's time column, preferring datetime_columns over metadata.
+
+        time_series_metadata.time_column may be a post-processing name
+        (e.g. feature_timestamp) that doesn't exist in the raw data.
+        datetime_columns contains the original column names.
+        """
+        ts = findings.time_series_metadata
+        metadata_col = ts.time_column if ts else None
+        if metadata_col and metadata_col in findings.columns:
+            return metadata_col
+        if findings.datetime_columns:
+            return findings.datetime_columns[0]
+        return metadata_col
+
+    def _build_timestamp_coalesce_config(self, findings: ExplorationFindings) -> Optional[TimestampCoalesceConfig]:
+        if len(findings.datetime_ordering) <= 1:
+            return None
+        output_col = findings.time_series_metadata.time_column if findings.time_series_metadata else "feature_timestamp"
+        return TimestampCoalesceConfig(datetime_columns_ordered=findings.datetime_ordering, output_column=output_col)
+
+    def _build_label_timestamp_config(self, findings: ExplorationFindings) -> Optional[LabelTimestampConfig]:
+        if not findings.label_timestamp_column and findings.observation_window_days == 180:
+            return None
+        return LabelTimestampConfig(
+            label_column=findings.label_timestamp_column,
+            fallback_window_days=findings.observation_window_days,
+        )
+
+    def _build_landing_configs(self, config: PipelineConfig, multi: MultiDatasetFindings, sources: Dict[str, ExplorationFindings]) -> None:
+        for event_name in multi.event_datasets:
+            dataset_info = multi.datasets.get(event_name)
+            if not dataset_info:
+                continue
+            findings = sources.get(event_name)
+            if not findings:
+                continue
+            entity_col = (dataset_info.entity_column
+                          or (findings.time_series_metadata.entity_column if findings.time_series_metadata else None)
+                          or (findings.identifier_columns[0] if findings.identifier_columns else "id"))
+            time_col = (dataset_info.time_column
+                        or (findings.time_series_metadata.time_column if findings.time_series_metadata else None)
+                        or "timestamp")
+            raw_time_col = self._resolve_raw_time_column(findings)
+            raw_source = str(Path(dataset_info.source_path or findings.source_path).resolve())
+            source_cfg = next((s for s in config.sources if s.name == event_name), None)
+            if not source_cfg:
+                continue
+            original_target = self._resolve_original_target(findings, config.target_column)
+            config.landing[event_name] = LandingLayerConfig(
+                source=source_cfg,
+                raw_source_path=raw_source,
+                raw_source_format=self._infer_format(raw_source),
+                entity_column=entity_col,
+                time_column=time_col,
+                target_column=config.target_column,
+                original_target_column=original_target,
+                raw_time_column=raw_time_col if raw_time_col and raw_time_col != time_col else None,
+                timestamp_coalesce=self._build_timestamp_coalesce_config(findings),
+                label_timestamp=self._build_label_timestamp_config(findings),
+            )
+
+    @staticmethod
+    def _resolve_original_target(findings: ExplorationFindings, target_column: str) -> Optional[str]:
+        original = findings.metadata.get("original_target_column") if findings.metadata else None
+        if original and original != target_column:
+            return original
+        return None
+
+    def _build_aggregation_config(self, multi: MultiDatasetFindings, findings: ExplorationFindings) -> Optional[AggregationWindowConfig]:
+        windows = getattr(multi, 'aggregation_windows', None) or []
+        if not windows and findings.time_series_metadata:
+            windows = getattr(findings.time_series_metadata, 'suggested_aggregations', []) or []
+        if not windows:
+            return None
+        value_columns = []
+        for col_name, col_finding in findings.columns.items():
+            col_type = _resolve_col_type(col_finding)
+            if col_type in ("numeric_continuous", "numeric_discrete", "numeric", "binary"):
+                if col_name not in (findings.target_column or ""):
+                    value_columns.append(col_name)
+        return AggregationWindowConfig(
+            windows=windows,
+            value_columns=value_columns,
+            agg_funcs=["sum", "mean", "max", "count"],
+        )
+
+    def _build_lifecycle_config(self, multi: MultiDatasetFindings) -> Optional[LifecycleConfig]:
+        notes = getattr(multi, 'notes', None)
+        if not notes:
+            return None
+        temporal_config = notes.get("temporal_config", {}) if isinstance(notes, dict) else {}
+        feature_groups = temporal_config.get("feature_groups", [])
+        return LifecycleConfig(
+            include_lifecycle_quadrant="lifecycle" in feature_groups,
+            include_cyclical_features="regularity" in feature_groups,
+            include_recency_bucket="recency" in feature_groups,
+            momentum_pairs=[],
+        )
+
+    def _build_bronze_event_configs(
+        self,
+        config: PipelineConfig,
+        multi: MultiDatasetFindings,
+        source_findings: Dict[str, ExplorationFindings],
+        discovered_events: Optional[Dict[str, ExplorationFindings]] = None,
+    ) -> None:
+        lifecycle_config = self._build_lifecycle_config(multi)
+        for event_name in multi.event_datasets:
+            findings = source_findings.get(event_name)
+            if not findings:
+                continue
+            source_cfg = next((s for s in config.sources if s.name == event_name), None)
+            if not source_cfg:
+                continue
+            dataset_info = multi.datasets.get(event_name)
+            entity_col = (dataset_info.entity_column if dataset_info else None) or source_cfg.entity_key
+            time_col = (dataset_info.time_column if dataset_info else None) or source_cfg.time_column or "timestamp"
+            raw_time_col = self._resolve_raw_time_column(findings)
+            config.bronze_event[event_name] = BronzeEventConfig(
+                source=source_cfg, entity_column=entity_col, time_column=time_col,
+                deduplicate=True,
+                pre_shaping=self._extract_transformations(findings),
+                aggregation=self._build_aggregation_config(multi, findings),
+                lifecycle=lifecycle_config,
+                raw_time_column=raw_time_col if raw_time_col and raw_time_col != time_col else None,
+            )
+        for agg_name, preagg in (discovered_events or {}).items():
+            if agg_name in config.bronze_event:
+                continue
+            source_cfg = next((s for s in config.sources if s.name == agg_name), None)
+            if not source_cfg:
+                continue
+            ts = preagg.time_series_metadata
+            entity_col = (ts.entity_column if ts else None) or source_cfg.entity_key
+            time_col = (ts.time_column if ts else None) or source_cfg.time_column or "timestamp"
+            raw_time_col = self._resolve_raw_time_column(preagg)
+            config.bronze_event[agg_name] = BronzeEventConfig(
+                source=source_cfg, entity_column=entity_col, time_column=time_col,
+                deduplicate=True,
+                pre_shaping=self._extract_transformations(preagg),
+                aggregation=self._build_aggregation_config(multi, preagg),
+                lifecycle=lifecycle_config,
+                raw_time_column=raw_time_col if raw_time_col and raw_time_col != time_col else None,
+            )
+
+    def _discover_event_sources(self, source_findings: Dict[str, ExplorationFindings]) -> Dict[str, ExplorationFindings]:
+        index = self._build_aggregated_path_index()
+        if not index:
+            return {}
+        return self._scan_for_preagg_findings(index)
+
+    def _build_aggregated_path_index(self) -> Dict[Path, str]:
+        return {path: name for name, path in self._source_findings_paths.items()}
+
+    def _scan_for_preagg_findings(self, index: Dict[Path, str]) -> Dict[str, ExplorationFindings]:
+        loaded_paths = set(self._source_findings_paths.values())
+        result: Dict[str, ExplorationFindings] = {}
+        for candidate in self._findings_dir.glob("*_findings.yaml"):
+            resolved = candidate.resolve()
+            if resolved in loaded_paths:
+                continue
+            if candidate.name == "multi_dataset_findings.yaml":
+                continue
+            try:
+                preagg = ExplorationFindings.load(str(candidate))
+            except Exception:
+                continue
+            source_name = self._match_preagg_to_source(preagg, index)
+            if source_name is not None:
+                result[source_name] = preagg
+        return result
+
+    def _match_preagg_to_source(self, preagg: ExplorationFindings, index: Dict[Path, str]) -> Optional[str]:
+        if not preagg.has_aggregated_output:
+            return None
+        agg_path_str = preagg.time_series_metadata.aggregated_findings_path
+        if not agg_path_str:
+            return None
+        agg_path = Path(agg_path_str).resolve()
+        return index.get(agg_path)
+
+    def _build_discovered_landing_configs(
+        self,
+        config: PipelineConfig,
+        discovered: Dict[str, ExplorationFindings],
+        multi: MultiDatasetFindings,
+    ) -> None:
+        for agg_name, preagg in discovered.items():
+            if agg_name in config.landing:
+                continue
+            source_cfg = next((s for s in config.sources if s.name == agg_name), None)
+            if not source_cfg:
+                continue
+            ts = preagg.time_series_metadata
+            entity_col = (ts.entity_column if ts else None) or source_cfg.entity_key
+            time_col = (ts.time_column if ts else None) or "timestamp"
+            raw_time_col = self._resolve_raw_time_column(preagg)
+            source_cfg.is_event_level = True
+            source_cfg.time_column = time_col
+            source_cfg.entity_key = entity_col
+            raw_source = str(Path(preagg.source_path).resolve())
+            original_target = self._resolve_original_target(preagg, config.target_column)
+            config.landing[agg_name] = LandingLayerConfig(
+                source=source_cfg,
+                raw_source_path=raw_source,
+                raw_source_format=self._infer_format(raw_source),
+                entity_column=entity_col,
+                time_column=time_col,
+                target_column=config.target_column,
+                original_target_column=original_target,
+                raw_time_column=raw_time_col if raw_time_col and raw_time_col != time_col else None,
+                timestamp_coalesce=self._build_timestamp_coalesce_config(preagg),
+                label_timestamp=self._build_label_timestamp_config(preagg),
+            )
+
+    @staticmethod
+    def _reconcile_discovered_event_transforms(config: "PipelineConfig", discovered_events: Dict[str, ExplorationFindings]) -> None:
+        if not discovered_events:
+            return
+        for name in list(discovered_events.keys()):
+            if name in config.bronze and name in config.bronze_event:
+                config.bronze_event[name].post_shaping.extend(config.bronze[name].transformations)
+                del config.bronze[name]
+
+    @staticmethod
+    def _infer_format(path: str) -> str:
+        ext = Path(path).suffix.lower()
+        if ext == ".csv":
+            return "csv"
+        return "parquet"