churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,201 @@
1
+ from customer_retention.analysis.auto_explorer.findings import ExplorationFindings
2
+ from customer_retention.core.config.column_config import ColumnType
3
+
4
+
5
class LLMContextBuilder:
    """Render exploration findings as Markdown text blocks.

    Each ``build_*`` method returns one Markdown document as a single string.
    ``build_full_context`` stitches several of them together, honouring the
    ``include_*`` switches given to the constructor.
    """

    def __init__(self,
                 include_databricks: bool = False,
                 include_framework_docs: bool = True,
                 max_sample_values: int = 10):
        """Store the rendering options.

        Args:
            include_databricks: When true, ``build_full_context`` appends the
                Databricks section.
            include_framework_docs: When true, ``build_full_context`` appends
                the framework documentation section.
            max_sample_values: Stored on the instance; no method of this
                class currently reads it.
        """
        self.include_databricks = include_databricks
        self.include_framework_docs = include_framework_docs
        self.max_sample_values = max_sample_values

    @staticmethod
    def _table_cell(value) -> str:
        """Return ``value`` as text that is safe inside one Markdown table cell.

        A raw ``|`` would start a new cell and a line break would end the
        row, so pipes are backslash-escaped and line breaks become spaces.
        """
        text = str(value)
        return text.replace("|", "\\|").replace("\r", " ").replace("\n", " ")

    @staticmethod
    def _bullet_section(title: str, items) -> list:
        """Return a heading, a blank line, one bullet per item, and a blank line."""
        return [title, "", *(f"- {item}" for item in items), ""]

    def build_exploration_context(self, findings: ExplorationFindings) -> str:
        """Build the dataset overview document.

        The document contains, in order: the dataset overview, the target
        section (only when ``findings.target_column`` is truthy), a summary
        table with one row per column, the per-column detail section, and
        the critical-issue and warning lists (each only when non-empty).

        Args:
            findings: Exploration results to render.

        Returns:
            The Markdown document, lines joined with ``"\\n"``.
        """
        lines = [
            "# Data Exploration Context",
            "",
            "## Dataset Overview",
            f"- **Source:** {findings.source_path}",
            f"- **Format:** {findings.source_format}",
            f"- **Rows:** {findings.row_count:,}",
            f"- **Columns:** {findings.column_count}",
            f"- **Overall Quality Score:** {findings.overall_quality_score:.1f}/100",
            "",
        ]
        if findings.target_column:
            lines.extend([
                "## Target Information",
                f"- **Target Column:** {findings.target_column}",
                f"- **Target Type:** {findings.target_type}",
                "",
            ])
        lines.extend([
            "## Column Details",
            "",
            "| Column | Type | Confidence | Nulls | Notes |",
            "|--------|------|------------|-------|-------|",
        ])
        for name, col in findings.columns.items():
            # _build_column_details treats universal_metrics as possibly
            # falsy, so tolerate a missing mapping (or a None value) here
            # too instead of raising.
            universal = col.universal_metrics or {}
            null_pct = universal.get("null_percentage", 0) or 0
            notes = "; ".join(col.evidence[:2]) if col.evidence else ""
            # Truncate first, then escape, so the 50-character cap applies
            # to the original note text.
            lines.append(
                f"| {self._table_cell(name)} | {col.inferred_type.value} "
                f"| {col.confidence:.0%} | {null_pct:.1f}% "
                f"| {self._table_cell(notes[:50])} |"
            )
        lines.append("")
        lines.extend(self._build_column_details(findings))
        if findings.critical_issues:
            lines.extend(self._bullet_section("## Critical Issues", findings.critical_issues))
        if findings.warnings:
            lines.extend(self._bullet_section("## Warnings", findings.warnings))
        return "\n".join(lines)

    def _build_column_details(self, findings: ExplorationFindings) -> list:
        """Return the lines of the per-column detail section.

        Every column gets a ``###`` heading with its type and confidence.
        Null/distinct counts are listed when ``universal_metrics`` is truthy;
        mean, std, range and the first three top categories are listed when
        ``type_metrics`` carries them.
        """
        lines = ["## Detailed Column Information", ""]
        for name, col in findings.columns.items():
            lines.append(f"### {name}")
            lines.append(f"- **Type:** {col.inferred_type.value}")
            lines.append(f"- **Confidence:** {col.confidence:.0%}")
            universal = col.universal_metrics
            if universal:
                lines.append(f"- **Null Count:** {universal.get('null_count', 0)}")
                lines.append(f"- **Distinct Count:** {universal.get('distinct_count', 'N/A')}")
            typed = col.type_metrics
            if typed:
                # A key may be present with a None value; skip it instead of
                # failing on the ".2f" format.
                mean = typed.get("mean")
                if mean is not None:
                    lines.append(f"- **Mean:** {mean:.2f}")
                std = typed.get("std")
                if std is not None:
                    lines.append(f"- **Std:** {std:.2f}")
                if "min_value" in typed:
                    lines.append(f"- **Range:** {typed['min_value']} to {typed.get('max_value', 'N/A')}")
                if "top_categories" in typed:
                    top = typed["top_categories"][:3]
                    lines.append(f"- **Top Categories:** {top}")
            lines.append("")
        return lines

    def build_configuration_context(self, findings: ExplorationFindings, user_goal: str) -> str:
        """Build the pipeline-configuration document.

        Embeds the user goal and the full exploration document, then lists
        one suggested transformation per column whose inferred type is
        numeric, categorical (nominal or ordinal) or datetime. Columns of
        any other type get no suggestion.

        Args:
            findings: Exploration results to render.
            user_goal: Free text placed under the "User Goal" heading.

        Returns:
            The Markdown document, lines joined with ``"\\n"``.
        """
        lines = [
            "# Pipeline Configuration Context",
            "",
            "## User Goal",
            f"{user_goal}",
            "",
            self.build_exploration_context(findings),
            "",
            "## Recommendations Summary",
            "",
        ]
        lines.append("### Suggested Transformations")
        numeric_types = (ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE)
        categorical_types = (ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL)
        for name, col in findings.columns.items():
            if col.inferred_type in numeric_types:
                lines.append(f"- **{name}:** Apply standard scaling")
            elif col.inferred_type in categorical_types:
                lines.append(f"- **{name}:** Apply encoding (one-hot or target)")
            elif col.inferred_type == ColumnType.DATETIME:
                lines.append(f"- **{name}:** Extract temporal features")
        lines.append("")
        return "\n".join(lines)

    def build_databricks_context(self, findings: ExplorationFindings) -> str:
        """Build the Databricks section.

        The text is fixed apart from the row count and a partitioning hint
        that is added only when ``findings.row_count`` exceeds one million.

        Args:
            findings: Exploration results; only ``row_count`` is read.

        Returns:
            The Markdown document, lines joined with ``"\\n"``.
        """
        lines = [
            "# Databricks Integration Context",
            "",
            "## Available Databricks Features",
            "",
            "### Delta Lake",
            "- ACID transactions for data reliability",
            "- Schema enforcement and evolution",
            "- Time travel for data versioning",
            "",
            "### Delta Live Tables (DLT)",
            "- Declarative pipeline definitions",
            "- Automatic dependency management",
            "- Built-in expectations for quality",
            "",
            "### Unity Catalog",
            "- Centralized data governance",
            "- Fine-grained access control",
            "- Data lineage tracking",
            "",
            "### Feature Store",
            "- Centralized feature repository",
            "- Point-in-time feature lookups",
            "- Online/offline feature serving",
            "",
            "### Spark Considerations",
            f"- Dataset has {findings.row_count:,} rows",
        ]
        if findings.row_count > 1_000_000:
            lines.append("- Consider partitioning for large dataset")
        lines.extend([
            "- Use DataFrame API for transformations",
            "- Leverage Spark ML for scalable modeling",
            "",
        ])
        return "\n".join(lines)

    def build_framework_docs_context(self) -> str:
        """Build the framework reference section.

        Lists every ``ColumnType`` member as ``NAME: value`` followed by a
        fixed summary of module names.

        Returns:
            The Markdown document, lines joined with ``"\\n"``.
        """
        lines = [
            "# Customer Retention Framework Documentation",
            "",
            "## ColumnType Reference",
            "",
            "Available column types in the framework:",
            "",
        ]
        lines.extend(f"- **{col_type.name}:** {col_type.value}" for col_type in ColumnType)
        lines.extend([
            "",
            "## Key Modules",
            "",
            "### Profiling",
            "- TypeDetector: Automatic type inference",
            "- ColumnProfiler: Statistical profiling per type",
            "- QualityChecks: Data quality validation",
            "",
            "### Transformation",
            "- NumericTransformer: Scaling, log transforms, binning",
            "- CategoricalEncoder: One-hot, target, ordinal encoding",
            "- DatetimeTransformer: Temporal feature extraction",
            "",
            "### Modeling",
            "- BaselineTrainer: Quick baseline models",
            "- CrossValidator: Robust cross-validation",
            "- HyperparameterTuner: Automated tuning",
            "",
            "### Validation",
            "- DataQualityGate: Data quality checks",
            "- LeakageGate: Feature leakage detection",
            "- ModelValidityGate: Model performance validation",
            "",
        ])
        return "\n".join(lines)

    def build_full_context(self, findings: ExplorationFindings, user_goal: str = "") -> str:
        """Assemble the combined document.

        Order: exploration document, optional user goal, optional framework
        docs, optional Databricks section. Parts are joined with a blank
        line.

        Args:
            findings: Exploration results to render.
            user_goal: Optional free text; omitted from the output when empty.

        Returns:
            The combined Markdown document.
        """
        # A "---" rule is appended after every part except the Databricks
        # one, so the result can end with a rule; that existing output
        # format is kept unchanged for callers.
        sections = [
            self.build_exploration_context(findings),
            "---",
        ]
        if user_goal:
            sections.append(f"## User Goal\n{user_goal}\n")
            sections.append("---")
        if self.include_framework_docs:
            sections.append(self.build_framework_docs_context())
            sections.append("---")
        if self.include_databricks:
            sections.append(self.build_databricks_context(findings))
        return "\n\n".join(sections)
@@ -0,0 +1,100 @@
1
class PromptTemplates:
    """Named prompt templates plus a helper to fill them in.

    Every template is a ``str.format`` string. All of them expect a
    ``context`` field; ``GENERATE_PIPELINE_CONFIG`` additionally expects
    ``user_goal``.
    """

    # Fields: context
    INFER_COLUMN_TYPES = """Given the following column information, infer the semantic type for each column.

Available column types:
- IDENTIFIER: Unique keys, IDs, codes
- TARGET: The prediction target (binary or multiclass)
- BINARY: Two-value columns (yes/no, true/false, 0/1)
- NUMERIC_CONTINUOUS: Continuous numeric values (amounts, measurements)
- NUMERIC_DISCRETE: Discrete numeric values (counts, ratings)
- CATEGORICAL_NOMINAL: Categories without order (colors, types)
- CATEGORICAL_ORDINAL: Categories with order (ratings, levels)
- CATEGORICAL_CYCLICAL: Cyclical categories (days, months)
- DATETIME: Date/time values
- TEXT: Free-form text

For each column, provide:
1. Inferred type
2. Confidence (0-100%)
3. Evidence supporting your inference

{context}

Please analyze each column and provide your type inference."""

    # Fields: context
    SUGGEST_TARGET_COLUMN = """Based on the data exploration findings below, suggest the most appropriate target column for a machine learning model.

Consider:
- Column names that suggest outcomes (churn, target, label, outcome, class)
- Binary or low-cardinality categorical columns
- Columns that seem to represent what we want to predict

{context}

Provide:
1. Recommended target column
2. Confidence level
3. Rationale for your choice
4. Alternative candidates (if any)"""

    # Fields: context
    RECOMMEND_FEATURES = """Based on the data exploration findings, recommend feature engineering opportunities.

Consider:
- Datetime columns: temporal features (year, month, day, day of week, days since)
- Numeric columns: binning, scaling, log transforms for skewed data
- Categorical columns: encoding strategies, interaction features
- Cross-column features: ratios, differences, combinations

{context}

For each recommendation, provide:
1. Source column(s)
2. Proposed feature name
3. Feature type and computation
4. Priority (high/medium/low)
5. Implementation hint"""

    # Fields: context, user_goal
    GENERATE_PIPELINE_CONFIG = """Generate a production pipeline configuration based on the exploration findings.

The configuration should include:
- Data source specifications
- Schema definitions
- Bronze layer transforms (raw data ingestion)
- Silver layer transforms (cleaning and standardization)
- Gold layer transforms (feature engineering)
- Model configuration
- Quality gates

{context}

User Goal: {user_goal}

Generate a complete pipeline specification in YAML format."""

    # Fields: context
    EXPLAIN_QUALITY_ISSUES = """Explain the data quality issues found in the exploration and provide remediation recommendations.

For each issue:
1. Describe the problem in business terms
2. Explain the potential impact on model performance
3. Recommend specific remediation steps
4. Prioritize by severity

{context}

Provide a clear, actionable quality improvement plan."""

    # Fields: context
    GENERATE_DLT_CODE = """Generate Databricks Delta Live Tables (DLT) code based on the pipeline specification.

Requirements:
- Use @dlt.table decorators
- Include expectations for quality checks
- Follow medallion architecture (bronze/silver/gold)
- Include proper schema definitions

{context}

Generate production-ready DLT Python code."""

    @classmethod
    def format_prompt(cls, template: str, **kwargs) -> str:
        """Fill ``template`` with the given keyword values.

        Args:
            template: A ``str.format`` template, typically one of the class
                constants.
            **kwargs: Values for the template's named fields.

        Returns:
            The formatted prompt.

        Raises:
            KeyError: If the template names a field missing from ``kwargs``.
        """
        rendered = template.format(**kwargs)
        return rendered
@@ -0,0 +1,103 @@
1
+ from .batch_integration import (
2
+ BatchStreamingBridge,
3
+ ProcessingConfig,
4
+ ProcessingMetrics,
5
+ ProcessingResult,
6
+ ScoreCombinationStrategy,
7
+ ScoreResult,
8
+ StreamProcessor,
9
+ )
10
+ from .early_warning_model import (
11
+ EarlyWarningConfig,
12
+ EarlyWarningModel,
13
+ SignalDetector,
14
+ SignalType,
15
+ WarningLevel,
16
+ WarningResult,
17
+ )
18
+ from .event_schema import (
19
+ BatchValidationResult,
20
+ Event,
21
+ EventSchema,
22
+ EventSource,
23
+ EventType,
24
+ EventValidator,
25
+ SchemaRegistry,
26
+ ValidationResult,
27
+ )
28
+ from .online_store_writer import (
29
+ BatchSyncResult,
30
+ FeatureLookup,
31
+ FeatureRecord,
32
+ FeatureStoreConfig,
33
+ FeatureStoreMetrics,
34
+ FeatureStoreSchema,
35
+ FeatureWriteResult,
36
+ FreshnessMetrics,
37
+ OnlineFeatureStore,
38
+ TTLConfig,
39
+ )
40
+ from .realtime_scorer import (
41
+ AutoScaler,
42
+ EndpointHealth,
43
+ RealtimeScorer,
44
+ RiskFactor,
45
+ ScalingDecision,
46
+ ScalingMetrics,
47
+ ScorerMetrics,
48
+ ScoringConfig,
49
+ ScoringRequest,
50
+ ScoringResponse,
51
+ SLAMetrics,
52
+ )
53
+ from .trigger_engine import (
54
+ ActionType,
55
+ AnomalyTrigger,
56
+ CompositeTrigger,
57
+ PatternTrigger,
58
+ StreamTriggerType,
59
+ ThresholdTrigger,
60
+ TriggerConfig,
61
+ TriggerContext,
62
+ TriggerDefinition,
63
+ TriggerEngine,
64
+ TriggerResult,
65
+ )
66
+ from .window_aggregator import (
67
+ AggregationResult,
68
+ FeatureComputer,
69
+ FeatureComputeResult,
70
+ SessionMetrics,
71
+ SessionWindow,
72
+ SlidingWindow,
73
+ StreamingFeature,
74
+ StreamState,
75
+ TumblingWindow,
76
+ WatermarkConfig,
77
+ Window,
78
+ WindowAggregator,
79
+ WindowType,
80
+ )
81
+
82
# Names re-exported by this package, grouped by the submodule they are
# imported from.
__all__ = [
    # event_schema
    "Event",
    "EventType",
    "EventSource",
    "EventSchema",
    "EventValidator",
    "ValidationResult",
    "BatchValidationResult",
    "SchemaRegistry",
    # window_aggregator
    "WindowType",
    "Window",
    "TumblingWindow",
    "SlidingWindow",
    "SessionWindow",
    "WatermarkConfig",
    "AggregationResult",
    "SessionMetrics",
    "WindowAggregator",
    "StreamState",
    "StreamingFeature",
    "FeatureComputer",
    "FeatureComputeResult",
    # online_store_writer
    "FeatureStoreConfig",
    "TTLConfig",
    "FeatureRecord",
    "FeatureWriteResult",
    "BatchSyncResult",
    "FeatureStoreMetrics",
    "FreshnessMetrics",
    "FeatureStoreSchema",
    "OnlineFeatureStore",
    "FeatureLookup",
    # early_warning_model
    "WarningLevel",
    "SignalType",
    "EarlyWarningConfig",
    "WarningResult",
    "SignalDetector",
    "EarlyWarningModel",
    # trigger_engine
    "StreamTriggerType",
    "ActionType",
    "TriggerConfig",
    "TriggerContext",
    "TriggerResult",
    "TriggerDefinition",
    "ThresholdTrigger",
    "PatternTrigger",
    "AnomalyTrigger",
    "CompositeTrigger",
    "TriggerEngine",
    # realtime_scorer
    "ScoringConfig",
    "ScoringRequest",
    "ScoringResponse",
    "RiskFactor",
    "EndpointHealth",
    "ScalingMetrics",
    "ScalingDecision",
    "SLAMetrics",
    "ScorerMetrics",
    "AutoScaler",
    "RealtimeScorer",
    # batch_integration
    "ScoreCombinationStrategy",
    "ScoreResult",
    "BatchStreamingBridge",
    "ProcessingConfig",
    "ProcessingResult",
    "ProcessingMetrics",
    "StreamProcessor",
]
@@ -0,0 +1,149 @@
1
+ from dataclasses import dataclass
2
+ from datetime import datetime
3
+ from enum import Enum
4
+ from typing import Dict, List, Optional
5
+
6
+
7
class ScoreCombinationStrategy(Enum):
    """Ways a batch score and a streaming score can be merged into one."""

    # Use the batch score and ignore the streaming score.
    BATCH_ONLY = "batch_only"
    # Let the streaming score take the place of the batch score.
    STREAMING_OVERRIDE = "streaming_override"
    # Weighted sum of both scores.
    ENSEMBLE = "ensemble"
    # Larger of the two scores.
    MAXIMUM = "maximum"
    # Batch score, nudged upwards when the streaming score is higher.
    SIGNAL_BOOST = "signal_boost"
13
+
14
+
15
@dataclass
class ScoreResult:
    """A score together with a label naming where it came from."""

    # The score value itself.
    score: float
    # Label of the producer, e.g. "realtime", "streaming", "batch".
    source: str
    # Optional time associated with the score; None when not supplied.
    timestamp: Optional[datetime] = None
20
+
21
+
22
class BatchStreamingBridge:
    """Reconcile scores and features coming from batch and streaming paths."""

    def __init__(self):
        # Batch feature name -> streaming feature name(s) it corresponds to.
        # Not read by any method of this class in the visible code.
        self._feature_mapping = {
            "days_since_last_order": "minutes_since_last_order",
            "email_engagement_score": ("email_opens_7d", "emails_sent_7d"),
            "order_frequency": "orders_7d"
        }

    def combine_scores(self, batch_score: Optional[float], streaming_score: Optional[float],
                       strategy: ScoreCombinationStrategy = ScoreCombinationStrategy.MAXIMUM,
                       weights: Optional[Dict[str, float]] = None,
                       batch_timestamp: Optional[datetime] = None,
                       streaming_timestamp: Optional[datetime] = None,
                       freshness_threshold_hours: int = 1) -> float:
        """Merge a batch score and a streaming score into a single score.

        When only one score is available it is returned unchanged regardless
        of *strategy*; when neither is available the result is ``0.0``.

        Args:
            batch_score: Score from the batch path, or None.
            streaming_score: Score from the streaming path, or None.
            strategy: How to combine the two scores when both are present.
            weights: For ENSEMBLE, multipliers under the keys ``"batch"`` and
                ``"streaming"``; a missing key (or no mapping) means 0.5.
            batch_timestamp: Time of the batch score. For STREAMING_OVERRIDE
                the freshness check only runs when both timestamps are given.
            streaming_timestamp: Time of the streaming score.
            freshness_threshold_hours: For STREAMING_OVERRIDE, the age in
                hours from which the streaming score is considered stale.

        Returns:
            The combined score.
        """
        if batch_score is None and streaming_score is None:
            return 0.0
        if streaming_score is None:
            return batch_score
        if batch_score is None:
            return streaming_score

        if strategy == ScoreCombinationStrategy.BATCH_ONLY:
            return batch_score

        if strategy == ScoreCombinationStrategy.STREAMING_OVERRIDE:
            if streaming_timestamp is not None and batch_timestamp is not None:
                # Take "now" in the timestamp's own tzinfo so that both naive
                # and timezone-aware timestamps can be subtracted safely.
                now = datetime.now(tz=streaming_timestamp.tzinfo)
                age_hours = (now - streaming_timestamp).total_seconds() / 3600
                if age_hours >= freshness_threshold_hours:
                    # A stale streaming score must not override the batch one.
                    return batch_score
            return streaming_score

        if strategy == ScoreCombinationStrategy.ENSEMBLE:
            w = weights or {"batch": 0.5, "streaming": 0.5}
            return batch_score * w.get("batch", 0.5) + streaming_score * w.get("streaming", 0.5)

        if strategy == ScoreCombinationStrategy.MAXIMUM:
            return max(batch_score, streaming_score)

        if strategy == ScoreCombinationStrategy.SIGNAL_BOOST:
            # Fixed uplift when streaming is higher than batch, capped at 1.0.
            boost = 0.1 if streaming_score > batch_score else 0.0
            return min(batch_score + boost, 1.0)

        # Unrecognised strategy: fall back to the batch score.
        return batch_score

    def map_features(self, batch_features: Dict[str, float], streaming_features: Dict[str, float],
                     prefer_streaming_recency: bool = False) -> Dict[str, float]:
        """Merge two feature dicts; streaming values win on key collisions.

        Args:
            batch_features: Features from the batch path (not mutated).
            streaming_features: Features from the streaming path.
            prefer_streaming_recency: When true and the streaming features
                contain ``"minutes_since_last_order"``, overwrite
                ``"days_since_last_order"`` with that value converted from
                minutes to days.

        Returns:
            A new dict holding the merged features.
        """
        merged = {**batch_features, **streaming_features}
        if prefer_streaming_recency and "minutes_since_last_order" in streaming_features:
            merged["days_since_last_order"] = streaming_features["minutes_since_last_order"] / (24 * 60)
        return merged

    def get_best_available_score(self, realtime_score: Optional[float] = None,
                                 streaming_score: Optional[float] = None,
                                 batch_score: Optional[float] = None,
                                 cached_score: Optional[float] = None) -> ScoreResult:
        """Return the first non-None score in a fixed priority order.

        Priority is realtime, streaming, batch, cached. When every score is
        None the result is ``0.0`` with source ``"default"``.
        """
        candidates = (
            ("realtime", realtime_score),
            ("streaming", streaming_score),
            ("batch", batch_score),
            ("cached", cached_score),
        )
        for source, score in candidates:
            if score is not None:
                return ScoreResult(score=score, source=source)
        return ScoreResult(score=0.0, source="default")
81
+
82
+
83
@dataclass
class ProcessingConfig:
    """Timing settings for stream processing; each field has a default."""

    # Interval between checkpoints, in seconds.
    checkpoint_interval_seconds: int = 60
    # Watermark delay, in minutes.
    watermark_delay_minutes: int = 10
    # Interval between triggers, in seconds.
    trigger_interval_seconds: int = 60
88
+
89
+
90
@dataclass
class ProcessingResult:
    """Counters and timing describing one processed batch of events."""

    # Number of events in the batch.
    events_processed: int = 0
    # Number of feature values computed for the batch.
    features_computed: int = 0
    # Number of errors recorded for the batch.
    errors: int = 0
    # Time spent processing the batch, in milliseconds.
    processing_time_ms: float = 0.0
96
+
97
+
98
@dataclass
class ProcessingMetrics:
    """Aggregate throughput and latency figures for a stream processor."""

    # Mean batch processing latency, in milliseconds.
    avg_processing_latency_ms: float = 0.0
    # Events handled per second.
    events_per_second: float = 0.0
102
+
103
+
104
class StreamProcessor:
    """Process event batches and keep running per-customer feature totals."""

    def __init__(self, config: Optional[ProcessingConfig] = None):
        """Create a processor with empty state.

        Args:
            config: Processing settings; a default ProcessingConfig is used
                when omitted.
        """
        self._config = config or ProcessingConfig()
        # customer_id -> {feature_name: accumulated value}
        self._state: Dict[str, Dict[str, float]] = {}
        # Duration of every process_batch call so far, in milliseconds.
        self._processing_times: List[float] = []
        self._events_processed = 0
        self._start_time = datetime.now()

    def process_batch(self, events: List) -> ProcessingResult:
        """Compute features for a batch of events and fold them into state.

        Events are grouped by their ``customer_id`` attribute; for each
        customer the computed feature values are added to the values already
        held in the processor's state.

        Args:
            events: Event objects exposing a ``customer_id`` attribute.

        Returns:
            A ProcessingResult with the event count, the number of feature
            values computed and the elapsed time in milliseconds.
        """
        import time

        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments the way time.time() can.
        started = time.perf_counter()
        from .window_aggregator import FeatureComputer

        computer = FeatureComputer()

        by_customer: Dict[str, List] = {}
        for event in events:
            by_customer.setdefault(event.customer_id, []).append(event)

        features_computed = 0
        for customer_id, customer_events in by_customer.items():
            result = computer.compute_all_features(customer_events, customer_id)
            customer_state = self._state.setdefault(customer_id, {})
            for feature_name, value in result.features.items():
                customer_state[feature_name] = customer_state.get(feature_name, 0) + value
            features_computed += len(result.features)

        elapsed_ms = (time.perf_counter() - started) * 1000
        self._processing_times.append(elapsed_ms)
        self._events_processed += len(events)
        return ProcessingResult(
            events_processed=len(events),
            features_computed=features_computed,
            processing_time_ms=elapsed_ms
        )

    def get_state(self, customer_id: str) -> Dict[str, float]:
        """Return a copy of the accumulated features for *customer_id*.

        An empty dict is returned for a customer with no recorded state.
        """
        return self._state.get(customer_id, {}).copy()

    def get_metrics(self) -> ProcessingMetrics:
        """Return mean batch latency and overall event throughput.

        Throughput divides the total events processed by the seconds elapsed
        since construction, with the divisor floored at one second.
        """
        import statistics

        elapsed_seconds = max((datetime.now() - self._start_time).total_seconds(), 1)
        avg_latency = statistics.mean(self._processing_times) if self._processing_times else 0.0
        return ProcessingMetrics(
            avg_processing_latency_ms=avg_latency,
            events_per_second=self._events_processed / elapsed_seconds
        )