churnkit-0.75.0a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
customer_retention/generators/notebook_generator/stages/s11_feature_store.py
@@ -0,0 +1,348 @@
+ """Feature store stage for notebook generation.
+
+ This stage generates notebooks that publish features to the feature store
+ and retrieve them for training with point-in-time correctness.
+ """
+
+ from typing import List
+
+ import nbformat
+
+ from ..base import NotebookStage
+ from .base_stage import StageGenerator
+
+
+ class FeatureStoreStage(StageGenerator):
+     """Generate feature store integration notebooks."""
+
+     @property
+     def stage(self) -> NotebookStage:
+         return NotebookStage.FEATURE_STORE
+
+     @property
+     def title(self) -> str:
+         return "11 - Feature Store Integration"
+
+     @property
+     def description(self) -> str:
+         return """Publish features to the feature store and create training sets with point-in-time correctness.
+
+ This notebook:
+ 1. Loads features from the gold layer
+ 2. Registers feature definitions
+ 3. Publishes features to the feature store (Feast or Databricks)
+ 4. Creates point-in-time correct training sets
+ """
+
+     def generate_local_cells(self) -> List[nbformat.NotebookNode]:
+         """Generate cells for local Feast-based workflow."""
+         return self.header_cells() + [
+             self.cb.section("1. Setup and Imports"),
+             self.cb.code('''import pandas as pd
+ import numpy as np
+ from pathlib import Path
+ from datetime import datetime
+
+ from customer_retention.integrations.feature_store import (
+     FeatureStoreManager,
+     FeatureRegistry,
+     TemporalFeatureDefinition,
+     FeatureComputationType,
+     TemporalAggregation,
+ )
+ from customer_retention.stages.temporal import SnapshotManager
+
+ print("Feature store imports loaded")'''),
+
+             self.cb.section("2. Load Gold Layer Data"),
+             self.cb.markdown('''Load the gold layer features. These should already have `feature_timestamp` for point-in-time correctness.'''),
+             self.cb.code('''# Load gold layer data
+ gold_path = Path("./experiments/data/gold/customers_features.parquet")
+ if gold_path.exists():
+     df = pd.read_parquet(gold_path)
+     print(f"Loaded gold layer: {df.shape}")
+ else:
+     # Fall back to snapshot
+     snapshot_manager = SnapshotManager(Path("./experiments/data"))
+     latest = snapshot_manager.get_latest_snapshot()
+     if latest:
+         df, _ = snapshot_manager.load_snapshot(latest)
+         print(f"Loaded snapshot {latest}: {df.shape}")
+     else:
+         raise FileNotFoundError("No gold layer or snapshot found")
+
+ # Verify temporal columns exist
+ required_cols = ["entity_id", "feature_timestamp"]
+ missing = [c for c in required_cols if c not in df.columns]
+ if missing:
+     print(f"Warning: Missing temporal columns: {missing}")
+ else:
+     print("Temporal columns present")
+ '''),
+
+             self.cb.section("3. Define Feature Registry"),
+             self.cb.markdown('''Create feature definitions with temporal metadata. This ensures consistent feature computation across training and inference.'''),
+             self.cb.code('''# Create feature registry
+ registry = FeatureRegistry()
+
+ # Get numeric columns (excluding metadata)
+ exclude_cols = {"entity_id", "target", "feature_timestamp", "label_timestamp", "label_available_flag"}
+ numeric_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c not in exclude_cols]
+
+ # Register each numeric feature
+ for col in numeric_cols:
+     registry.register(TemporalFeatureDefinition(
+         name=col,
+         description=f"Feature: {col}",
+         entity_key="entity_id",
+         timestamp_column="feature_timestamp",
+         source_columns=[col],
+         computation_type=FeatureComputationType.PASSTHROUGH,
+         data_type=str(df[col].dtype),
+         leakage_risk="low",
+     ))
+
+ print(f"Registered {len(registry)} features")
+ print(f"Features: {registry.list_features()[:10]}...") # Show first 10
+ '''),
+
+             self.cb.section("4. Initialize Feature Store Manager"),
+             self.cb.code('''# Create feature store manager (uses Feast locally)
+ manager = FeatureStoreManager.create(
+     backend="feast",
+     repo_path="./experiments/feature_store/feature_repo",
+     output_path="./experiments/data",
+ )
+
+ print("Feature store manager initialized")
+ print(f"Existing tables: {manager.list_tables()}")
+ '''),
+
+             self.cb.section("5. Publish Features to Feature Store"),
+             self.cb.code('''# Publish features
+ table_name = manager.publish_features(
+     df=df,
+     registry=registry,
+     table_name="customer_features",
+     entity_key="entity_id",
+     timestamp_column="feature_timestamp",
+     mode="overwrite", # Use "merge" for incremental updates
+ )
+
+ print(f"Published features to: {table_name}")
+ print(f"Tables after publish: {manager.list_tables()}")
+ '''),
+
+             self.cb.section("6. Create Point-in-Time Training Set"),
+             self.cb.markdown('''Create a training set with point-in-time correct feature retrieval. The entity DataFrame specifies when we want to "observe" each customer.'''),
+             self.cb.code('''# Create entity DataFrame with observation timestamps
+ # This simulates "when would we have made a prediction?"
+ entity_df = df[["entity_id", "feature_timestamp"]].copy()
+ entity_df = entity_df.rename(columns={"feature_timestamp": "event_timestamp"})
+
+ # Get point-in-time correct features
+ training_df = manager.get_training_features(
+     entity_df=entity_df,
+     registry=registry,
+     feature_names=registry.list_features()[:20], # First 20 features
+     table_name="customer_features",
+     timestamp_column="event_timestamp",
+ )
+
+ print(f"Training set shape: {training_df.shape}")
+ print(f"Columns: {list(training_df.columns)}")
+ '''),
+
+             self.cb.section("7. Save Feature Registry"),
+             self.cb.code('''# Save registry for later use
+ registry_path = Path("./experiments/feature_store/feature_registry.json")
+ registry_path.parent.mkdir(parents=True, exist_ok=True)
+ registry.save(registry_path)
+ print(f"Saved feature registry to {registry_path}")
+
+ # Verify we can reload it
+ loaded_registry = FeatureRegistry.load(registry_path)
+ print(f"Reloaded registry: {len(loaded_registry)} features")
+ '''),
+
+             self.cb.section("8. Validate Feature Store Integration"),
+             self.cb.code('''# Validate that features match between direct load and feature store
+ direct_features = df[["entity_id"] + registry.list_features()[:5]].head(10)
+ store_features = training_df[["entity_id"] + [f for f in registry.list_features()[:5] if f in training_df.columns]].head(10)
+
+ print("Direct load sample:")
+ print(direct_features)
+ print("\\nFeature store sample:")
+ print(store_features)
+
+ # Check for mismatches
+ if set(direct_features.columns) == set(store_features.columns):
+     merged = direct_features.merge(store_features, on="entity_id", suffixes=("_direct", "_store"))
+     for col in registry.list_features()[:5]:
+         if f"{col}_direct" in merged.columns and f"{col}_store" in merged.columns:
+             match = np.allclose(
+                 merged[f"{col}_direct"].fillna(0),
+                 merged[f"{col}_store"].fillna(0),
+                 rtol=1e-5
+             )
+             print(f" {col}: {'MATCH' if match else 'MISMATCH'}")
+ '''),
+
+             self.cb.section("9. Summary"),
+             self.cb.code('''print("=" * 60)
+ print("Feature Store Integration Complete")
+ print("=" * 60)
+ print(f"Features registered: {len(registry)}")
+ print(f"Feature table: customer_features")
+ print(f"Registry saved: {registry_path}")
+ print(f"Training set shape: {training_df.shape}")
+ print()
+ print("Next steps:")
+ print("1. Use the feature store for model training")
+ print("2. Use get_inference_features() for online serving")
+ print("3. Schedule feature refresh jobs")
+ '''),
+         ]
+
+     def generate_databricks_cells(self) -> List[nbformat.NotebookNode]:
+         """Generate cells for Databricks Feature Engineering workflow."""
+         catalog = self.config.feature_store.catalog
+         schema = self.config.feature_store.schema
+
+         return self.header_cells() + [
+             self.cb.section("1. Setup"),
+             self.cb.code(f'''from databricks.feature_engineering import FeatureEngineeringClient, FeatureLookup
+ from pyspark.sql.functions import col, current_timestamp
+ import pandas as pd
+
+ fe = FeatureEngineeringClient()
+ CATALOG = "{catalog}"
+ SCHEMA = "{schema}"
+
+ print(f"Using catalog: {{CATALOG}}.{{SCHEMA}}")
+ '''),
+
+             self.cb.section("2. Load Gold Table"),
+             self.cb.code(f'''# Load gold layer
+ df = spark.table("{catalog}.{schema}.gold_customers")
+ print(f"Loaded gold table: {{df.count()}} rows")
+
+ # Display schema
+ df.printSchema()
+ '''),
+
+             self.cb.section("3. Create Feature Table"),
+             self.cb.markdown('''Create a Unity Catalog feature table with primary keys and timestamp column for point-in-time lookups.'''),
+             self.cb.code(f'''# Define feature table
+ FEATURE_TABLE = "{catalog}.{schema}.customer_features"
+
+ # Select feature columns (exclude metadata)
+ exclude_cols = {{"entity_id", "target", "feature_timestamp", "label_timestamp", "label_available_flag"}}
+ feature_cols = [c for c in df.columns if c not in exclude_cols]
+
+ # Create or replace feature table
+ feature_df = df.select(
+     "entity_id",
+     "feature_timestamp",
+     *feature_cols
+ )
+
+ fe.create_table(
+     name=FEATURE_TABLE,
+     primary_keys=["entity_id"],
+     timestamp_keys=["feature_timestamp"],
+     df=feature_df,
+     description="Customer features for churn prediction",
+ )
+
+ print(f"Created feature table: {{FEATURE_TABLE}}")
+ '''),
+
+             self.cb.section("4. Create Training Set with Point-in-Time Lookups"),
+             self.cb.code('''# Create entity DataFrame with observation timestamps
+ entity_df = df.select("entity_id", col("feature_timestamp").alias("event_timestamp"))
+
+ # Define feature lookups with timestamp_lookup_key for PIT correctness
+ feature_lookups = [
+     FeatureLookup(
+         table_name=FEATURE_TABLE,
+         lookup_key=["entity_id"],
+         timestamp_lookup_key="event_timestamp",
+     )
+ ]
+
+ # Create training set
+ training_set = fe.create_training_set(
+     df=entity_df,
+     feature_lookups=feature_lookups,
+     label=None, # Add label column name if joining labels
+ )
+
+ training_df = training_set.load_df()
+ print(f"Training set: {training_df.count()} rows, {len(training_df.columns)} columns")
+ training_df.show(5)
+ '''),
+
+             self.cb.section("5. Log Model with Feature Store Lineage"),
+             self.cb.markdown('''When training models, use `fe.log_model()` to capture feature lineage. This enables automatic feature lookup during inference.'''),
+             self.cb.code('''# Example: Train and log model with feature lineage
+ # (Uncomment and modify for your model)
+
+ # from sklearn.ensemble import RandomForestClassifier
+ # import mlflow
+ #
+ # # Prepare features
+ # pdf = training_df.toPandas()
+ # feature_cols = [c for c in pdf.columns if c not in ["entity_id", "event_timestamp", "target"]]
+ # X = pdf[feature_cols]
+ # y = pdf["target"]
+ #
+ # # Train model
+ # model = RandomForestClassifier(n_estimators=100)
+ # model.fit(X, y)
+ #
+ # # Log with feature lineage
+ # with mlflow.start_run():
+ #     fe.log_model(
+ #         model=model,
+ #         artifact_path="model",
+ #         flavor=mlflow.sklearn,
+ #         training_set=training_set,
+ #     )
+ #     print("Model logged with feature lineage")
+ '''),
+
+             self.cb.section("6. Online Feature Serving"),
+             self.cb.markdown('''For real-time inference, use Model Serving with automatic feature lookup.'''),
+             self.cb.code('''# Score batch with automatic feature lookup
+ # The model automatically retrieves latest features from the feature table
+
+ # Example: Score new customers
+ # new_customers = spark.createDataFrame([
+ #     {"entity_id": "new_customer_1"},
+ #     {"entity_id": "new_customer_2"},
+ # ])
+ #
+ # # Feature lookups happen automatically during scoring
+ # predictions = fe.score_batch(
+ #     df=new_customers,
+ #     model_uri="models:/churn_model/production",
+ # )
+ # predictions.show()
+ '''),
+
+             self.cb.section("7. Summary"),
+             self.cb.code('''print("=" * 60)
+ print("Databricks Feature Store Integration Complete")
+ print("=" * 60)
+ print(f"Feature table: {FEATURE_TABLE}")
+ print(f"Primary key: entity_id")
+ print(f"Timestamp key: feature_timestamp")
+ print(f"Training set rows: {training_df.count()}")
+ print()
+ print("Next steps:")
+ print("1. Train model using training_set")
+ print("2. Log model with fe.log_model() for lineage")
+ print("3. Deploy to Model Serving for auto feature lookup")
+ '''),
+         ]
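For orientation, here is a minimal sketch of how the cells produced by this stage might be assembled into a runnable notebook. The StageGenerator constructor and the config object it expects are not shown in this diff, so their shape below is assumed; the placeholder is marked in the comments.

import nbformat

from customer_retention.generators.notebook_generator.stages.s11_feature_store import FeatureStoreStage

# Assumption: the stage is built from the notebook-generator config (config.py in this
# package), which exposes feature_store.catalog / feature_store.schema as used above.
config = ...  # placeholder; config construction is not part of this diff
stage = FeatureStoreStage(config)  # assumed constructor signature

nb = nbformat.v4.new_notebook()
nb.cells = stage.generate_local_cells()  # or generate_databricks_cells() on Databricks
nbformat.write(nb, "11_feature_store.ipynb")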
customer_retention/generators/orchestration/__init__.py
@@ -0,0 +1,23 @@
+ # Import context first to avoid circular imports
+ from .context import ContextManager, PipelineContext, setup_notebook_context
+ from .data_materializer import DataMaterializer
+ from .databricks_exporter import DatabricksExporter
+ from .doc_generator import PipelineDocGenerator
+
+
+ # Deferred import - code_generator has heavy dependencies
+ def __getattr__(name):
+     if name == "PipelineCodeGenerator":
+         from .code_generator import PipelineCodeGenerator
+         return PipelineCodeGenerator
+     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+ __all__ = [
+     "PipelineContext",
+     "ContextManager",
+     "setup_notebook_context",
+     "PipelineCodeGenerator",
+     "PipelineDocGenerator",
+     "DataMaterializer",
+     "DatabricksExporter",
+ ]
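The module-level __getattr__ above is the PEP 562 hook: importing the package stays cheap because PipelineCodeGenerator is only imported from .code_generator the first time the name is actually accessed. A small illustration, using the package paths from this wheel:

import customer_retention.generators.orchestration as orch

ctx_cls = orch.PipelineContext        # imported eagerly when the package loads
gen_cls = orch.PipelineCodeGenerator  # first access triggers the deferred import of .code_generator
print(gen_cls.__name__)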
customer_retention/generators/orchestration/code_generator.py
@@ -0,0 +1,196 @@
+ from typing import List
+
+ from customer_retention.analysis.auto_explorer.layered_recommendations import (
+     LayeredRecommendation,
+     RecommendationRegistry,
+ )
+
+
+ class PipelineCodeGenerator:
+     def __init__(self, registry: RecommendationRegistry):
+         self.registry = registry
+
+     def generate_bronze_code(self) -> str:
+         lines = [
+             "from customer_retention.stages.cleaning import MissingValueHandler, OutlierHandler",
+             "",
+             "",
+             "def bronze_transform(df):",
+         ]
+         if not self.registry.bronze or not self.registry.bronze.all_recommendations:
+             lines.append("    return df")
+             return "\n".join(lines)
+
+         for rec in self.registry.bronze.null_handling:
+             lines.extend(self._generate_null_handling(rec))
+         for rec in self.registry.bronze.outlier_handling:
+             lines.extend(self._generate_outlier_handling(rec))
+         for rec in self.registry.bronze.type_conversions:
+             lines.extend(self._generate_type_conversion(rec))
+         for rec in self.registry.bronze.filtering:
+             lines.extend(self._generate_filtering(rec))
+
+         lines.append("    return df")
+         return "\n".join(lines)
+
+     def generate_silver_code(self) -> str:
+         lines = ["", "", "def silver_transform(df):"]
+         if not self.registry.silver or not self.registry.silver.all_recommendations:
+             lines.append("    return df")
+             return "\n".join(lines)
+
+         entity_col = self.registry.silver.entity_column
+         time_col = self.registry.silver.time_column
+
+         for rec in self.registry.silver.aggregations:
+             lines.extend(self._generate_aggregation(rec, entity_col, time_col))
+         for rec in self.registry.silver.derived_columns:
+             lines.extend(self._generate_derived(rec))
+         for rec in self.registry.silver.joins:
+             lines.extend(self._generate_join(rec))
+
+         lines.append("    return df")
+         return "\n".join(lines)
+
+     def generate_gold_code(self) -> str:
+         lines = [
+             "from sklearn.preprocessing import StandardScaler, RobustScaler",
+             "import numpy as np",
+             "",
+             "",
+             "def gold_transform(df):",
+         ]
+         if not self.registry.gold or not self.registry.gold.all_recommendations:
+             lines.append("    return df")
+             return "\n".join(lines)
+
+         for rec in self.registry.gold.transformations:
+             lines.extend(self._generate_transformation(rec))
+         for rec in self.registry.gold.encoding:
+             lines.extend(self._generate_encoding(rec))
+         for rec in self.registry.gold.scaling:
+             lines.extend(self._generate_scaling(rec))
+
+         lines.append("    return df")
+         return "\n".join(lines)
+
+     def generate_full_pipeline(self) -> str:
+         bronze = self.generate_bronze_code()
+         silver = self.generate_silver_code()
+         gold = self.generate_gold_code()
+         main = self._generate_main_function()
+         return f"{bronze}\n{silver}\n{gold}\n{main}"
+
+     def _generate_null_handling(self, rec: LayeredRecommendation) -> List[str]:
+         strategy = rec.parameters.get("strategy", "median")
+         return [
+             f"    # {rec.rationale}",
+             f"    handler = MissingValueHandler(strategy='{strategy}')",
+             f"    df = handler.fit_transform(df, columns=['{rec.target_column}'])",
+             "",
+         ]
+
+     def _generate_outlier_handling(self, rec: LayeredRecommendation) -> List[str]:
+         method = rec.parameters.get("method", "iqr")
+         factor = rec.parameters.get("factor", 1.5)
+         return [
+             f"    # {rec.rationale}",
+             f"    outlier_handler = OutlierHandler(method='{method}', factor={factor})",
+             f"    df['{rec.target_column}'] = outlier_handler.fit_transform(df[['{rec.target_column}']])",
+             "",
+         ]
+
+     def _generate_type_conversion(self, rec: LayeredRecommendation) -> List[str]:
+         target_type = rec.parameters.get("target_type", "str")
+         return [
+             f"    # {rec.rationale}",
+             f"    df['{rec.target_column}'] = df['{rec.target_column}'].astype('{target_type}')",
+             "",
+         ]
+
+     def _generate_filtering(self, rec: LayeredRecommendation) -> List[str]:
+         if rec.action == "drop":
+             return [
+                 f"    # {rec.rationale}",
+                 f"    df = df.drop(columns=['{rec.target_column}'])",
+                 "",
+             ]
+         return []
+
+     def _generate_aggregation(self, rec: LayeredRecommendation, entity_col: str, time_col: str) -> List[str]:
+         agg = rec.parameters.get("aggregation", "sum")
+         windows = rec.parameters.get("windows", ["7d"])
+         col = rec.target_column
+         lines = [f"    # {rec.rationale}"]
+         for window in windows:
+             feature_name = f"{col}_{agg}_{window}"
+             lines.append(f"    df['{feature_name}'] = df.groupby('{entity_col}')['{col}'].transform('{agg}')")
+         lines.append("")
+         return lines
+
+     def _generate_derived(self, rec: LayeredRecommendation) -> List[str]:
+         formula = rec.parameters.get("formula", "")
+         return [
+             f"    # {rec.rationale}",
+             f"    df['{rec.target_column}'] = {formula} # TODO: adapt formula",
+             "",
+         ]
+
+     def _generate_join(self, rec: LayeredRecommendation) -> List[str]:
+         dataset = rec.parameters.get("dataset", "")
+         join_type = rec.parameters.get("join_type", "left")
+         return [
+             f"    # {rec.rationale}",
+             f"    # df = df.merge(load('{dataset}'), on='{rec.target_column}', how='{join_type}')",
+             "",
+         ]
+
+     def _generate_encoding(self, rec: LayeredRecommendation) -> List[str]:
+         method = rec.parameters.get("method", "one_hot")
+         col = rec.target_column
+         if method == "one_hot":
+             drop_first = rec.parameters.get("drop_first", False)
+             return [
+                 f"    # {rec.rationale}",
+                 f"    df = pd.get_dummies(df, columns=['{col}'], drop_first={drop_first})",
+                 "",
+             ]
+         elif method == "target":
+             return [
+                 f"    # {rec.rationale} - target encoding",
+                 f"    # Use TargetEncoder for '{col}'",
+                 "",
+             ]
+         return []
+
+     def _generate_scaling(self, rec: LayeredRecommendation) -> List[str]:
+         method = rec.parameters.get("method", "standard")
+         col = rec.target_column
+         scaler = "StandardScaler" if method == "standard" else "RobustScaler"
+         return [
+             f"    # {rec.rationale}",
+             f"    scaler = {scaler}()",
+             f"    df['{col}'] = scaler.fit_transform(df[['{col}']])",
+             "",
+         ]
+
+     def _generate_transformation(self, rec: LayeredRecommendation) -> List[str]:
+         method = rec.parameters.get("method", "log")
+         col = rec.target_column
+         if method == "log":
+             return [
+                 f"    # {rec.rationale}",
+                 f"    df['{col}'] = np.log1p(df['{col}'])",
+                 "",
+             ]
+         return []
+
+     def _generate_main_function(self) -> str:
+         return """
+
+ def run_pipeline(df):
+     df = bronze_transform(df)
+     df = silver_transform(df)
+     df = gold_transform(df)
+     return df
+ """