churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,34 @@
1
+ from typing import List
2
+
3
+ from customer_retention.core.compat import ops
4
+ from customer_retention.generators.orchestration.context import PipelineContext
5
+
6
+ from ..base import Component, ComponentResult
7
+
8
+
9
class Ingester(Component):
    """Pipeline component (chapter 1) that loads the raw CSV into the context."""

    def __init__(self):
        super().__init__(name="Ingester", chapters=[1])

    def validate_inputs(self, context: PipelineContext) -> List[str]:
        """Return validation errors; the list is empty when ``raw_data_path`` is set."""
        if context.raw_data_path:
            return []
        return ["raw_data_path is required"]

    def run(self, context: PipelineContext) -> ComponentResult:
        """Read ``context.raw_data_path`` via ``ops.read_csv`` and publish it on the context.

        On success the frame is stored in ``context.current_df``, the stage is
        set to ``"bronze"`` and row/column counts are reported as metrics.
        Any exception is converted into a failed result carrying its message.
        """
        self._start_timer()
        try:
            frame = ops.read_csv(context.raw_data_path)
            context.current_df = frame
            context.current_stage = "bronze"
            n_rows = len(frame)
            n_cols = len(frame.columns)
            # The bronze artifact is only advertised when a bronze path is configured.
            if context.bronze_path:
                artifacts = {"bronze_data": context.bronze_path}
            else:
                artifacts = {}
            return self.create_result(
                success=True,
                artifacts=artifacts,
                metrics={"row_count": n_rows, "column_count": n_cols},
            )
        except Exception as exc:
            return self.create_result(success=False, errors=[str(exc)])
@@ -0,0 +1,34 @@
1
+ from typing import List
2
+
3
+ from customer_retention.generators.orchestration.context import PipelineContext
4
+
5
+ from ..base import Component, ComponentResult
6
+
7
+
8
class Profiler(Component):
    """Pipeline component (chapter 2) that runs type detection and column profiling."""

    def __init__(self):
        super().__init__(name="Profiler", chapters=[2])

    def validate_inputs(self, context: PipelineContext) -> List[str]:
        """Return validation errors; the list is empty when a DataFrame is present."""
        if context.current_df is not None:
            return []
        return ["No DataFrame available for profiling"]

    def run(self, context: PipelineContext) -> ComponentResult:
        """Run ``TypeDetector.detect_all`` and ``ColumnProfiler.profile_all`` on the current frame.

        Both outputs are stored in ``context.profiling_results`` under the keys
        ``"types"`` and ``"profile"``. Any exception becomes a failed result.
        """
        self._start_timer()
        try:
            # Imported inside the guarded region so an import failure is
            # reported as a failed result instead of propagating.
            from customer_retention.stages.profiling.column_profiler import ColumnProfiler
            from customer_retention.stages.profiling.type_detector import TypeDetector

            frame = context.current_df
            detected_types = TypeDetector().detect_all(frame)
            column_profile = ColumnProfiler().profile_all(frame)
            context.profiling_results = {"types": detected_types, "profile": column_profile}
            return self.create_result(
                success=True,
                metrics={"columns_profiled": len(frame.columns)},
            )
        except Exception as exc:
            return self.create_result(success=False, errors=[str(exc)])
@@ -0,0 +1,38 @@
1
+ from typing import List
2
+
3
+ from customer_retention.generators.orchestration.context import PipelineContext
4
+
5
+ from ..base import Component, ComponentResult
6
+
7
+
8
class Trainer(Component):
    """Pipeline component (chapter 5) that splits the data and trains baseline models."""

    def __init__(self):
        super().__init__(name="Trainer", chapters=[5])

    def validate_inputs(self, context: PipelineContext) -> List[str]:
        """Return validation errors for a missing DataFrame and/or target column."""
        errors = []
        if context.current_df is None:
            errors.append("No DataFrame available for training")
        if not context.target_column:
            errors.append("target_column is required for training")
        return errors

    def run(self, context: PipelineContext) -> ComponentResult:
        """Split the current frame, train all baseline models and report the best one.

        The training results are stored in ``context.model_results``. The best
        model is the entry with the highest ``"pr_auc"`` (missing values count
        as 0). When training yields no results at all, a failed result with an
        explicit message is returned rather than the opaque error ``max()``
        would raise on an empty collection. Any exception becomes a failed
        result carrying its message.
        """
        self._start_timer()
        try:
            from customer_retention.stages.modeling.baseline_trainer import BaselineTrainer
            from customer_retention.stages.modeling.data_splitter import DataSplitter

            df = context.current_df
            target = context.target_column
            splitter = DataSplitter()
            X_train, X_test, y_train, y_test = splitter.split(df, target)
            trainer = BaselineTrainer()
            results = trainer.train_all(X_train, y_train, X_test, y_test)
            context.model_results = results

            def pr_auc_of(model_name):
                return results[model_name].get("pr_auc", 0)

            # default=None keeps an empty result set from raising inside max().
            best_model = max(results, key=pr_auc_of, default=None)
            if best_model is None:
                return self.create_result(
                    success=False,
                    errors=["No models were trained; cannot select a best model"],
                )
            return self.create_result(
                success=True,
                metrics={"best_model": best_model, "pr_auc": pr_auc_of(best_model)},
            )
        except Exception as e:
            return self.create_result(success=False, errors=[str(e)])
@@ -0,0 +1,36 @@
1
+ from typing import List
2
+
3
+ from customer_retention.generators.orchestration.context import PipelineContext
4
+
5
+ from ..base import Component, ComponentResult
6
+
7
+
8
class Transformer(Component):
    """Pipeline component (chapter 3) that applies the cleaning handlers to the current frame."""

    def __init__(self):
        super().__init__(name="Transformer", chapters=[3])

    def validate_inputs(self, context: PipelineContext) -> List[str]:
        """Return validation errors; the list is empty when a DataFrame is present."""
        if context.current_df is not None:
            return []
        return ["No DataFrame available for transformation"]

    def run(self, context: PipelineContext) -> ComponentResult:
        """Pass the current frame through ``MissingHandler`` and then ``OutlierHandler``.

        The resulting frame replaces ``context.current_df`` and the stage is set
        to ``"silver"``. Any exception becomes a failed result.
        """
        self._start_timer()
        try:
            from customer_retention.stages.cleaning.missing_handler import MissingHandler
            from customer_retention.stages.cleaning.outlier_handler import OutlierHandler

            frame = context.current_df
            frame = MissingHandler().handle(frame)
            frame = OutlierHandler().handle(frame)
            context.current_df = frame
            context.current_stage = "silver"
            # The silver artifact is only advertised when a silver path is configured.
            if context.silver_path:
                artifacts = {"silver_data": context.silver_path}
            else:
                artifacts = {}
            return self.create_result(
                success=True,
                artifacts=artifacts,
                metrics={"row_count": len(frame)},
            )
        except Exception as exc:
            return self.create_result(success=False, errors=[str(exc)])
@@ -0,0 +1,37 @@
1
+ from typing import List
2
+
3
+ from customer_retention.generators.orchestration.context import PipelineContext
4
+
5
+ from ..base import Component, ComponentResult
6
+
7
+
8
class Validator(Component):
    """Pipeline component (chapter 6) that records diagnostic checks for trained models."""

    def __init__(self):
        super().__init__(name="Validator", chapters=[6])

    def validate_inputs(self, context: PipelineContext) -> List[str]:
        """Return validation errors; the list is empty when model results exist."""
        if context.model_results:
            return []
        return ["No model results available for validation"]

    def run(self, context: PipelineContext) -> ComponentResult:
        """Instantiate the diagnostic analyzers and mark each diagnostic as checked.

        NOTE(review): the analyzers are only constructed here; none of their
        analysis methods is called, and ``context.validation_results`` is
        filled with fixed ``"checked"`` markers. Confirm whether real
        diagnostics are meant to run at this point.
        """
        self._start_timer()
        try:
            from customer_retention.analysis.diagnostics.calibration_analyzer import CalibrationAnalyzer
            from customer_retention.analysis.diagnostics.leakage_detector import LeakageDetector
            from customer_retention.analysis.diagnostics.overfitting_analyzer import OverfittingAnalyzer

            # Construction order matches the keys recorded below.
            for analyzer_cls in (LeakageDetector, OverfittingAnalyzer, CalibrationAnalyzer):
                analyzer_cls()
            context.validation_results = {
                "leakage": "checked",
                "overfitting": "checked",
                "calibration": "checked",
            }
            return self.create_result(success=True, metrics={"diagnostics_run": 3})
        except Exception as exc:
            return self.create_result(success=False, errors=[str(exc)])
@@ -0,0 +1,33 @@
1
+ from enum import Enum
2
+
3
+
4
class Severity(str, Enum):
    """Severity labels; members are also plain strings because of the ``str`` mixin."""

    # Declaration order is the iteration order of the enum.
    CRITICAL = "critical"
    HIGH = "high"
    WARNING = "warning"
    MEDIUM = "medium"
    LOW = "low"
    INFO = "info"
11
+
12
+
13
class ModelType(Enum):
    """Identifiers of the supported model families."""

    # Plain Enum (no str mixin): members do not compare equal to their string values.
    LOGISTIC_REGRESSION = "logistic_regression"
    RANDOM_FOREST = "random_forest"
    XGBOOST = "xgboost"
    LIGHTGBM = "lightgbm"
    CATBOOST = "catboost"
19
+
20
+
21
class RiskSegment(Enum):
    """Risk segments a customer can be assigned to; values are display labels."""

    # Declared from highest to lowest risk.
    CRITICAL = "Critical"
    HIGH = "High"
    MEDIUM = "Medium"
    LOW = "Low"
    VERY_LOW = "Very Low"
28
+
29
+
30
class Platform(str, Enum):
    """Supported deployment platforms; members are also plain strings."""

    LOCAL = "local"
    DATABRICKS = "databricks"
@@ -0,0 +1,94 @@
1
+ import time
2
+ from dataclasses import dataclass
3
+ from typing import TYPE_CHECKING, Dict, List
4
+
5
+ from .base import Component, ComponentResult, ComponentStatus
6
+ from .registry import ComponentRegistry
7
+
8
+ if TYPE_CHECKING:
9
+ from customer_retention.generators.orchestration.context import PipelineContext
10
+
11
+
12
@dataclass
class OrchestratorResult:
    """Aggregate outcome of one orchestrated run."""

    success: bool
    components_run: List[str]  # component names, in execution order
    results: Dict[str, ComponentResult]  # per-component result keyed by name
    total_duration_seconds: float

    def get_summary(self) -> str:
        """Return a one-line summary such as ``SUCCESS: 3 components in 1.2s``."""
        if self.success:
            label = "SUCCESS"
        else:
            label = "FAILED"
        component_count = len(self.components_run)
        return f"{label}: {component_count} components in {self.total_duration_seconds:.1f}s"
22
+
23
+
24
class Orchestrator:
    """Run registered components one after another against a shared pipeline context.

    Components are executed in the order the registry returns them and the run
    stops at the first component that reports failure.
    """

    def __init__(self, registry: ComponentRegistry, context: "PipelineContext"):
        self.registry = registry
        self.context = context

    def run_training(self) -> OrchestratorResult:
        """Run every component registered for chapters 1 through 7."""
        return self.run_chapters([1, 2, 3, 4, 5, 6, 7])

    def run_phase(self, phase: str) -> OrchestratorResult:
        """Run all components registered under ``phase``."""
        # The timer starts before the registry lookup so the lookup is part of
        # the reported duration.
        started = time.perf_counter()
        registrations = self.registry.get_phase_components(phase)
        return self._run_registrations(registrations, started)

    def run_chapters(self, chapters: List[int]) -> OrchestratorResult:
        """Run all components that the registry returns for ``chapters``."""
        started = time.perf_counter()
        registrations = self.registry.get_chapters_components(chapters)
        return self._run_registrations(registrations, started)

    def run_single(self, component_name: str) -> ComponentResult:
        """Run one component by its registered name.

        Whatever ``registry.get_component`` raises for an unknown name is
        propagated unchanged.
        """
        reg = self.registry.get_component(component_name)
        return self._run_component(reg.component_class)

    def _run_registrations(self, registrations, started: float) -> OrchestratorResult:
        """Execute ``registrations`` in order, stopping at the first failure.

        ``started`` is the ``time.perf_counter()`` reading taken by the caller;
        a monotonic clock keeps the duration immune to wall-clock adjustments.
        """
        components_run: List[str] = []
        results: Dict[str, ComponentResult] = {}
        success = True
        for reg in registrations:
            name = self._get_name_for_registration(reg)
            result = self._run_component(reg.component_class)
            results[name] = result
            components_run.append(name)
            if not result.success:
                success = False
                break
        return OrchestratorResult(
            success=success,
            components_run=components_run,
            results=results,
            total_duration_seconds=time.perf_counter() - started,
        )

    def _run_component(self, component_class: type) -> ComponentResult:
        """Instantiate a component, validate its inputs and run it unless skipped."""
        component: Component = component_class()
        errors = component.validate_inputs(self.context)
        if errors:
            return ComponentResult(
                success=False, status=ComponentStatus.FAILED,
                errors=errors
            )
        if component.should_skip(self.context):
            return ComponentResult(success=True, status=ComponentStatus.SKIPPED)
        return component.run(self.context)

    def _get_name_for_registration(self, reg) -> str:
        """Return the name ``reg`` is registered under.

        Identity is checked first: two registrations with equal fields stored
        under different names must not resolve to the same name, otherwise
        their results would overwrite each other. Equality is kept as a
        fallback for registries that hand out copies, and the lower-cased
        class name is the last resort.
        """
        registered = self.registry._components
        for name, candidate in registered.items():
            if candidate is reg:
                return name
        for name, candidate in registered.items():
            if candidate == reg:
                return name
        return reg.component_class.__name__.lower()
@@ -0,0 +1,59 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Dict, List, Optional, Type
3
+
4
+ from .base import Component
5
+
6
+
7
# One entry of a ComponentRegistry: the class to run, the phase it is filed
# under, and the names it was registered as depending on.
@dataclass
class ComponentRegistration:
    # Class that is instantiated with no arguments when the component is used.
    component_class: Type[Component]
    # Phase label; ComponentRegistry.get_phase_components filters on equality with it.
    phase: str
    # Names of other components, as passed to ComponentRegistry.register.
    # The registry itself stores these without consulting them.
    dependencies: List[str] = field(default_factory=list)
12
+
13
+
14
class ComponentRegistry:
    """Name-keyed collection of ``ComponentRegistration`` entries.

    Entries are kept in insertion order, and every query method returns them
    in that order.
    """

    PHASES = ["discovery", "data_preparation", "model_development", "production"]

    def __init__(self):
        self._components: Dict[str, ComponentRegistration] = {}

    def register(self, name: str, component_class: Type[Component], phase: str,
                 dependencies: Optional[List[str]] = None) -> None:
        """Store a registration under ``name``, replacing any existing one."""
        entry = ComponentRegistration(
            component_class=component_class,
            phase=phase,
            dependencies=dependencies or [],
        )
        self._components[name] = entry

    def get_component(self, name: str) -> ComponentRegistration:
        """Return the registration stored under ``name``.

        Raises:
            KeyError: If nothing is registered under ``name``.
        """
        if name in self._components:
            return self._components[name]
        raise KeyError(f"Component '{name}' not found")

    def get_phase_components(self, phase: str) -> List[ComponentRegistration]:
        """Return every registration whose ``phase`` equals ``phase``."""
        matching = []
        for registration in self._components.values():
            if registration.phase == phase:
                matching.append(registration)
        return matching

    def get_chapters_components(self, chapters: List[int]) -> List[ComponentRegistration]:
        """Return registrations whose component lists any of ``chapters``.

        Each component class is instantiated once to read its ``chapters``.
        """
        def touches_requested(registration) -> bool:
            instance = registration.component_class()
            return any(ch in instance.chapters for ch in chapters)

        return [reg for reg in self._components.values() if touches_requested(reg)]

    def list_components(self) -> List[str]:
        """Return the registered names."""
        return list(self._components)
46
+
47
+
48
def get_default_registry() -> ComponentRegistry:
    """Build a registry populated with the eight built-in components.

    The component classes are imported inside the function rather than at
    module import time.
    """
    from .components import (
        Deployer,
        Explainer,
        FeatureEngineer,
        Ingester,
        Profiler,
        Trainer,
        Transformer,
        Validator,
    )

    # (name, class, phase, dependencies) in registration order.
    defaults = (
        ("ingester", Ingester, "data_preparation", None),
        ("profiler", Profiler, "data_preparation", ["ingester"]),
        ("transformer", Transformer, "data_preparation", ["profiler"]),
        ("feature_engineer", FeatureEngineer, "data_preparation", ["transformer"]),
        ("trainer", Trainer, "model_development", ["feature_engineer"]),
        ("validator", Validator, "model_development", ["trainer"]),
        ("explainer", Explainer, "model_development", ["trainer"]),
        ("deployer", Deployer, "production", ["validator"]),
    )
    registry = ComponentRegistry()
    for name, component_class, phase, dependencies in defaults:
        registry.register(name, component_class, phase, dependencies)
    return registry
@@ -0,0 +1,39 @@
1
+ from .column_config import ColumnConfig, ColumnType, DatasetGranularity
2
+ from .experiments import (
3
+ DATA_DIR,
4
+ EXPERIMENTS_DIR,
5
+ FEATURE_STORE_DIR,
6
+ FINDINGS_DIR,
7
+ MLRUNS_DIR,
8
+ OUTPUT_DIR,
9
+ get_data_dir,
10
+ get_experiments_dir,
11
+ get_feature_store_dir,
12
+ get_findings_dir,
13
+ get_mlruns_dir,
14
+ get_notebook_experiments_dir,
15
+ setup_experiments_structure,
16
+ )
17
+ from .pipeline_config import (
18
+ BronzeConfig,
19
+ DedupStrategy,
20
+ GoldConfig,
21
+ ModelingConfig,
22
+ PathConfig,
23
+ PipelineConfig,
24
+ SilverConfig,
25
+ ValidationConfig,
26
+ )
27
+ from .source_config import DataSourceConfig, FileFormat, Grain, SourceType
28
+
29
# Names re-exported by this package, grouped by the submodule they are
# imported from above.
__all__ = [
    # .column_config
    "ColumnType", "ColumnConfig", "DatasetGranularity",
    # .source_config
    "SourceType", "FileFormat", "Grain", "DataSourceConfig",
    # .pipeline_config
    "DedupStrategy", "BronzeConfig", "SilverConfig", "GoldConfig",
    "ModelingConfig", "ValidationConfig", "PathConfig", "PipelineConfig",
    # .experiments
    "EXPERIMENTS_DIR", "FINDINGS_DIR", "DATA_DIR", "MLRUNS_DIR",
    "FEATURE_STORE_DIR", "OUTPUT_DIR", "get_experiments_dir",
    "get_findings_dir", "get_data_dir", "get_mlruns_dir",
    "get_feature_store_dir", "get_notebook_experiments_dir",
    "setup_experiments_structure",
]
@@ -0,0 +1,95 @@
1
+ from enum import Enum
2
+ from typing import Optional
3
+
4
+ from pydantic import BaseModel, model_validator
5
+
6
+
7
# Role/type tag assigned to a column. The str mixin makes each member compare
# equal to its string value.
class ColumnType(str, Enum):
    # The first four members are the ones listed in NON_FEATURE_COLUMN_TYPES.
    IDENTIFIER = "identifier"
    TARGET = "target"
    FEATURE_TIMESTAMP = "feature_timestamp"
    LABEL_TIMESTAMP = "label_timestamp"
    NUMERIC_CONTINUOUS = "numeric_continuous"
    NUMERIC_DISCRETE = "numeric_discrete"
    CATEGORICAL_NOMINAL = "categorical_nominal"
    # ColumnConfig requires ordinal_order for this type.
    CATEGORICAL_ORDINAL = "categorical_ordinal"
    # ColumnConfig requires cyclical_max for this type.
    CATEGORICAL_CYCLICAL = "categorical_cyclical"
    DATETIME = "datetime"
    BINARY = "binary"
    TEXT = "text"
    UNKNOWN = "unknown"
21
+
22
+
23
# Column types that are excluded from features by default (leakage risk).
# NOTE: ColumnConfig.should_be_used_as_feature consults this set only when
# is_feature is None; an explicit is_feature value takes precedence over it.
NON_FEATURE_COLUMN_TYPES = frozenset({
    ColumnType.IDENTIFIER,
    ColumnType.TARGET,
    ColumnType.FEATURE_TIMESTAMP,
    ColumnType.LABEL_TIMESTAMP,
})
30
+
31
+
32
# The str mixin makes each member compare equal to its string value.
class DatasetGranularity(str, Enum):
    """Describes the grain/granularity of a dataset.

    ENTITY_LEVEL: One row per entity (e.g., one row per customer)
    EVENT_LEVEL: Multiple rows per entity over time (e.g., transactions, emails)
    UNKNOWN: Cannot determine granularity
    """
    ENTITY_LEVEL = "entity_level"
    EVENT_LEVEL = "event_level"
    UNKNOWN = "unknown"
42
+
43
+
44
class ColumnConfig(BaseModel):
    # Configuration record for a single column: its type tag plus optional
    # per-column settings. Only name and column_type are required.
    name: str
    column_type: ColumnType
    nullable: bool = True

    encoding_strategy: Optional[str] = None
    scaling_strategy: Optional[str] = None
    missing_strategy: Optional[str] = None
    # Required when column_type is CATEGORICAL_ORDINAL (enforced by the validator).
    ordinal_order: Optional[list[str]] = None
    # Required when column_type is CATEGORICAL_CYCLICAL (enforced by the validator).
    cyclical_max: Optional[int] = None

    min_value: Optional[float] = None
    max_value: Optional[float] = None
    allowed_values: Optional[list[str]] = None
    regex_pattern: Optional[str] = None

    description: Optional[str] = None
    business_name: Optional[str] = None
    # None means "decide from column_type"; see should_be_used_as_feature.
    is_feature: Optional[bool] = None
    exclude_from_model: bool = False

    @model_validator(mode='after')
    def validate_cyclical_and_ordinal(self):
        """Require the extra field that cyclical and ordinal columns depend on.

        Raises:
            ValueError: If a CATEGORICAL_CYCLICAL column has no
                ``cyclical_max`` or a CATEGORICAL_ORDINAL column has no
                ``ordinal_order``.
        """
        kind = self.column_type
        if self.cyclical_max is None and kind == ColumnType.CATEGORICAL_CYCLICAL:
            raise ValueError("cyclical_max required for CATEGORICAL_CYCLICAL columns")
        if self.ordinal_order is None and kind == ColumnType.CATEGORICAL_ORDINAL:
            raise ValueError("ordinal_order required for CATEGORICAL_ORDINAL columns")
        return self

    def should_be_used_as_feature(self) -> bool:
        """Decide whether this column is a model feature.

        ``exclude_from_model`` wins; otherwise an explicit ``is_feature`` is
        returned as-is; otherwise the column is a feature unless its type is
        in ``NON_FEATURE_COLUMN_TYPES``.
        """
        if self.exclude_from_model:
            return False
        explicit = self.is_feature
        if explicit is None:
            return self.column_type not in NON_FEATURE_COLUMN_TYPES
        return explicit

    def is_categorical(self) -> bool:
        """Return True for nominal, ordinal, cyclical, and binary columns."""
        categorical_kinds = (
            ColumnType.CATEGORICAL_NOMINAL,
            ColumnType.CATEGORICAL_ORDINAL,
            ColumnType.CATEGORICAL_CYCLICAL,
            ColumnType.BINARY,
        )
        return self.column_type in categorical_kinds

    def is_numeric(self) -> bool:
        """Return True for continuous and discrete numeric columns."""
        numeric_kinds = (ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE)
        return self.column_type in numeric_kinds

    def is_temporal(self) -> bool:
        """Return True only for DATETIME columns."""
        return self.column_type == ColumnType.DATETIME
@@ -0,0 +1,71 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+
6
def _find_project_root() -> Path:
    """Locate the nearest enclosing directory that looks like a project root.

    Starting at this file's directory, checks that directory and then its
    ancestors (ten directories at most) for a ``pyproject.toml`` or ``.git``
    entry. Falls back to the current working directory when none is found.
    """
    here = Path(__file__).parent
    # This file's directory followed by its ancestors, nearest first.
    for candidate in (here, *here.parents)[:10]:
        has_marker = (candidate / "pyproject.toml").exists() or (candidate / ".git").exists()
        if has_marker:
            return candidate
    return Path.cwd()
13
+
14
+
15
def get_experiments_dir(default: Optional[str] = None) -> Path:
    """Resolve the experiments directory.

    Precedence: the ``CR_EXPERIMENTS_DIR`` environment variable if it is set,
    then a non-empty ``default``, then ``experiments`` under the detected
    project root.
    """
    env_value = os.environ.get("CR_EXPERIMENTS_DIR")
    if env_value is not None:
        return Path(env_value)
    return Path(default) if default else _find_project_root() / "experiments"
21
+
22
+
23
def get_findings_dir(default: Optional[str] = None) -> Path:
    """Return the ``findings`` subdirectory of the experiments directory."""
    experiments_root = get_experiments_dir(default)
    return experiments_root.joinpath("findings")
25
+
26
+
27
def get_data_dir(default: Optional[str] = None) -> Path:
    """Return the ``data`` subdirectory of the experiments directory."""
    experiments_root = get_experiments_dir(default)
    return experiments_root.joinpath("data")
29
+
30
+
31
def get_mlruns_dir(default: Optional[str] = None) -> Path:
    """Return the ``mlruns`` subdirectory of the experiments directory."""
    experiments_root = get_experiments_dir(default)
    return experiments_root.joinpath("mlruns")
33
+
34
+
35
def get_feature_store_dir(default: Optional[str] = None) -> Path:
    """Return the ``feature_repo`` subdirectory of the experiments directory."""
    experiments_root = get_experiments_dir(default)
    return experiments_root.joinpath("feature_repo")
37
+
38
+
39
# Module-level defaults, computed once when this module is imported. Changes
# to CR_EXPERIMENTS_DIR made after import are not reflected here; call the
# get_*_dir functions for a fresh lookup.
EXPERIMENTS_DIR = get_experiments_dir()
FINDINGS_DIR = get_findings_dir()
DATA_DIR = get_data_dir()
MLRUNS_DIR = get_mlruns_dir()
FEATURE_STORE_DIR = get_feature_store_dir()
# OUTPUT_DIR is an alias of FINDINGS_DIR.
OUTPUT_DIR = FINDINGS_DIR
45
+
46
+
47
def setup_experiments_structure(experiments_dir: Optional[Path] = None) -> None:
    """Create the standard directory tree under the experiments directory.

    Args:
        experiments_dir: Root to populate; when omitted, the result of
            ``get_experiments_dir()`` is used.

    Missing parents are created and directories that already exist are left
    alone, so the call can be repeated safely.
    """
    root = experiments_dir or get_experiments_dir()
    layout = (
        ("findings", "snapshots"),
        ("findings", "unified"),
        ("data", "bronze"),
        ("data", "silver"),
        ("data", "gold"),
        ("data", "scoring"),
        ("mlruns",),
        ("feature_repo", "data"),
    )
    for parts in layout:
        root.joinpath(*parts).mkdir(parents=True, exist_ok=True)
61
+
62
+
63
def get_notebook_experiments_dir() -> Path:
    """Resolve the experiments directory relative to the working directory.

    Precedence: the ``CR_EXPERIMENTS_DIR`` environment variable if it is set,
    then an existing ``experiments`` directory beside the current working
    directory (``../experiments``), then one inside it, and finally
    ``get_experiments_dir()``.
    """
    env_value = os.environ.get("CR_EXPERIMENTS_DIR")
    if env_value is not None:
        return Path(env_value)
    here = Path.cwd()
    # The parent's folder is probed before the working directory's own.
    for candidate in (here.parent / "experiments", here / "experiments"):
        if candidate.exists():
            return candidate
    return get_experiments_dir()