churnkit-0.75.0a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb
@@ -0,0 +1,1463 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "cell-0",
6
+ "metadata": {
7
+ "papermill": {
8
+ "duration": 0.005093,
9
+ "end_time": "2026-02-02T13:02:48.607922",
10
+ "exception": false,
11
+ "start_time": "2026-02-02T13:02:48.602829",
12
+ "status": "completed"
13
+ },
14
+ "tags": []
15
+ },
16
+ "source": [
17
+ "# Chapter 1d: Event Aggregation (Event Bronze Track → Entity Bronze Track)\n",
18
+ "\n",
19
+ "**Purpose:** Aggregate event-level data to entity-level, applying all insights from 01a-01c.\n",
20
+ "\n",
21
+ "**When to use this notebook:**\n",
22
+ "- After completing 01a (temporal profiling), 01b (quality checks), 01c (pattern analysis)\n",
23
+ "- Your dataset is EVENT_LEVEL granularity\n",
24
+ "- You want to create entity-level features informed by temporal patterns\n",
25
+ "\n",
26
+ "**What this notebook produces:**\n",
27
+ "- Aggregated parquet file (one row per entity)\n",
28
+ "- New findings file for the aggregated data\n",
29
+ "- Updated original findings with aggregation metadata\n",
30
+ "\n",
31
+ "**How 01a-01c findings inform aggregation:**\n",
32
+ "\n",
33
+ "| Source | Insight Applied |\n",
34
+ "|--------|----------------|\n",
35
+ "| **01a** | Recommended windows (e.g., 180d, 365d), lifecycle quadrant feature |\n",
36
+ "| **01b** | Quality issues to handle (gaps, duplicates) |\n",
37
+ "| **01c** | Divergent columns for velocity/momentum (prioritize these features) |\n",
38
+ "\n",
39
+ "---\n",
40
+ "\n",
41
+ "## Understanding the Shape Transformation\n",
42
+ "\n",
43
+ "```\n",
44
+ "EVENT-LEVEL (input) ENTITY-LEVEL (output)\n",
45
+ "┌─────────────────────┐ ┌─────────────────────────────────────┐\n",
46
+ "│ customer │ date │ │ customer │ events_180d │ quadrant │ ...\n",
47
+ "├──────────┼──────────┤ → ├──────────┼─────────────┼──────────┤\n",
48
+ "│ A │ Jan 1 │ │ A │ 12 │ Steady │\n",
49
+ "│ A │ Jan 5 │ │ B │ 5 │ Brief │\n",
50
+ "│ A │ Jan 10 │ │ C │ 2 │ Loyal │\n",
51
+ "│ B │ Jan 3 │ └──────────┴─────────────┴──────────┘\n",
52
+ "│ ... │ ... │\n",
53
+ "└──────────┴──────────┘\n",
54
+ "Many rows per entity One row per entity + lifecycle features\n",
55
+ "```"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "markdown",
60
+ "id": "cell-1",
61
+ "metadata": {
62
+ "papermill": {
63
+ "duration": 0.003368,
64
+ "end_time": "2026-02-02T13:02:48.616108",
65
+ "exception": false,
66
+ "start_time": "2026-02-02T13:02:48.612740",
67
+ "status": "completed"
68
+ },
69
+ "tags": []
70
+ },
71
+ "source": [
72
+ "## 1d.1 Load Findings and Data"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": null,
78
+ "id": "cell-2",
79
+ "metadata": {
80
+ "execution": {
81
+ "iopub.execute_input": "2026-02-02T13:02:48.622485Z",
82
+ "iopub.status.busy": "2026-02-02T13:02:48.622374Z",
83
+ "iopub.status.idle": "2026-02-02T13:02:50.408234Z",
84
+ "shell.execute_reply": "2026-02-02T13:02:50.407558Z"
85
+ },
86
+ "papermill": {
87
+ "duration": 1.790201,
88
+ "end_time": "2026-02-02T13:02:50.409167",
89
+ "exception": false,
90
+ "start_time": "2026-02-02T13:02:48.618966",
91
+ "status": "completed"
92
+ },
93
+ "tags": []
94
+ },
95
+ "outputs": [],
96
+ "source": [
97
+ "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
98
+ "track_and_export_previous(\"01d_event_aggregation.ipynb\")\n",
99
+ "\n",
100
+ "from customer_retention.analysis.auto_explorer import ExplorationFindings, DataExplorer\n",
101
+ "from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
102
+ "from customer_retention.core.config.column_config import ColumnType, DatasetGranularity\n",
103
+ "from customer_retention.stages.profiling import (\n",
104
+ " AggregationFeatureConfig,\n",
105
+ " TimeWindowAggregator,\n",
106
+ " TimeSeriesProfiler,\n",
107
+ " classify_lifecycle_quadrants,\n",
108
+ " classify_activity_segments,\n",
109
+ " create_momentum_ratio_features,\n",
110
+ " create_recency_bucket_feature,\n",
111
+ " deduplicate_events,\n",
112
+ " get_duplicate_event_count,\n",
113
+ ")\n",
114
+ "from datetime import datetime\n",
115
+ "from pathlib import Path\n",
116
+ "import pandas as pd\n",
117
+ "import numpy as np\n",
118
+ "from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": null,
124
+ "id": "cell-3",
125
+ "metadata": {
126
+ "execution": {
127
+ "iopub.execute_input": "2026-02-02T13:02:50.414489Z",
128
+ "iopub.status.busy": "2026-02-02T13:02:50.414348Z",
129
+ "iopub.status.idle": "2026-02-02T13:02:50.451903Z",
130
+ "shell.execute_reply": "2026-02-02T13:02:50.451157Z"
131
+ },
132
+ "papermill": {
133
+ "duration": 0.041484,
134
+ "end_time": "2026-02-02T13:02:50.452927",
135
+ "exception": false,
136
+ "start_time": "2026-02-02T13:02:50.411443",
137
+ "status": "completed"
138
+ },
139
+ "tags": []
140
+ },
141
+ "outputs": [],
142
+ "source": [
143
+ "# === CONFIGURATION ===\n",
144
+ "# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
145
+ "\n",
146
+ "# Find findings files (exclude multi_dataset and already-aggregated)\n",
147
+ "findings_files = [\n",
148
+ " f for f in FINDINGS_DIR.glob(\"*_findings.yaml\") \n",
149
+ " if \"multi_dataset\" not in f.name and \"_aggregated\" not in f.name\n",
150
+ "]\n",
151
+ "if not findings_files:\n",
152
+ " raise FileNotFoundError(f\"No findings files found in {FINDINGS_DIR}. Run notebook 01 first.\")\n",
153
+ "\n",
154
+ "findings_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)\n",
155
+ "FINDINGS_PATH = str(findings_files[0])\n",
156
+ "\n",
157
+ "print(f\"Using: {FINDINGS_PATH}\")\n",
158
+ "findings = ExplorationFindings.load(FINDINGS_PATH)\n",
159
+ "print(f\"Loaded findings for {findings.column_count} columns from {findings.source_path}\")"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": null,
165
+ "id": "cell-4",
166
+ "metadata": {
167
+ "execution": {
168
+ "iopub.execute_input": "2026-02-02T13:02:50.458673Z",
169
+ "iopub.status.busy": "2026-02-02T13:02:50.458499Z",
170
+ "iopub.status.idle": "2026-02-02T13:02:50.468331Z",
171
+ "shell.execute_reply": "2026-02-02T13:02:50.467753Z"
172
+ },
173
+ "papermill": {
174
+ "duration": 0.013426,
175
+ "end_time": "2026-02-02T13:02:50.468887",
176
+ "exception": false,
177
+ "start_time": "2026-02-02T13:02:50.455461",
178
+ "status": "completed"
179
+ },
180
+ "tags": []
181
+ },
182
+ "outputs": [],
183
+ "source": [
184
+ "# Verify this is event-level data and display findings summary\n",
185
+ "if not findings.is_time_series:\n",
186
+ " print(\"⚠️ This dataset is NOT event-level. Aggregation not needed.\")\n",
187
+ " print(\" Proceed directly to 02_column_deep_dive.ipynb\")\n",
188
+ " raise SystemExit(\"Skipping aggregation - data is already entity-level\")\n",
189
+ "\n",
190
+ "ts_meta = findings.time_series_metadata\n",
191
+ "ENTITY_COLUMN = ts_meta.entity_column\n",
192
+ "TIME_COLUMN = ts_meta.time_column\n",
193
+ "\n",
194
+ "print(\"=\" * 70)\n",
195
+ "print(\"FINDINGS SUMMARY FROM 01a-01c\")\n",
196
+ "print(\"=\" * 70)\n",
197
+ "\n",
198
+ "# === 01a: Time Series Metadata ===\n",
199
+ "print(\"\\n📊 FROM 01a (Temporal Profiling):\")\n",
200
+ "print(f\" Entity column: {ENTITY_COLUMN}\")\n",
201
+ "print(f\" Time column: {TIME_COLUMN}\")\n",
202
+ "if ts_meta.unique_entities:\n",
203
+ " print(f\" Unique entities: {ts_meta.unique_entities:,}\")\n",
204
+ "if ts_meta.avg_events_per_entity:\n",
205
+ " print(f\" Avg events/entity: {ts_meta.avg_events_per_entity:.1f}\")\n",
206
+ "if ts_meta.time_span_days:\n",
207
+ " print(f\" Time span: {ts_meta.time_span_days:,} days\")\n",
208
+ "\n",
209
+ "if ts_meta.suggested_aggregations:\n",
210
+ " print(f\"\\n ✅ Recommended windows: {ts_meta.suggested_aggregations}\")\n",
211
+ "else:\n",
212
+ " print(\"\\n ⚠️ No window recommendations - will use defaults\")\n",
213
+ "\n",
214
+ "if ts_meta.temporal_segmentation_recommendation:\n",
215
+ " print(f\"\\n 📋 Segmentation recommendation:\")\n",
216
+ " print(f\" {ts_meta.temporal_segmentation_recommendation}\")\n",
217
+ " if ts_meta.heterogeneity_level:\n",
218
+ " print(f\" Heterogeneity: {ts_meta.heterogeneity_level}\")\n",
219
+ "\n",
220
+ "if ts_meta.drift_risk_level:\n",
221
+ " print(f\"\\n ⚠️ Drift risk: {ts_meta.drift_risk_level.upper()}\")\n",
222
+ " if ts_meta.volume_drift_risk:\n",
223
+ " print(f\" Volume drift: {ts_meta.volume_drift_risk}\")\n",
224
+ " if ts_meta.population_stability is not None:\n",
225
+ " print(f\" Population stability: {ts_meta.population_stability:.2f}\")\n",
226
+ "\n",
227
+ "# === 01b: Temporal Quality ===\n",
228
+ "quality_meta = findings.metadata.get(\"temporal_quality\", {})\n",
229
+ "if quality_meta:\n",
230
+ " print(f\"\\n📋 FROM 01b (Temporal Quality):\")\n",
231
+ " if quality_meta.get(\"temporal_quality_score\"):\n",
232
+ " print(f\" Quality score: {quality_meta.get('temporal_quality_score'):.1f}\")\n",
233
+ " if quality_meta.get(\"temporal_quality_grade\"):\n",
234
+ " print(f\" Quality grade: {quality_meta.get('temporal_quality_grade')}\")\n",
235
+ " issues = quality_meta.get(\"issues\", {})\n",
236
+ " if issues.get(\"duplicate_events\", 0) > 0:\n",
237
+ " print(f\" ⚠️ Duplicate events: {issues['duplicate_events']:,}\")\n",
238
+ " if issues.get(\"temporal_gaps\", 0) > 0:\n",
239
+ " print(f\" ⚠️ Temporal gaps: {issues['temporal_gaps']:,}\")\n",
240
+ "\n",
241
+ "# === 01c: Temporal Patterns ===\n",
242
+ "pattern_meta = findings.metadata.get(\"temporal_patterns\", {})\n",
243
+ "SEASONALITY_RECOMMENDATIONS = [] # Store for later application\n",
244
+ "TEMPORAL_PATTERN_RECOMMENDATIONS = [] # Store for later application\n",
245
+ "TREND_RECOMMENDATIONS = [] # Store for later application\n",
246
+ "COHORT_RECOMMENDATIONS = [] # Store for later application\n",
247
+ "\n",
248
+ "if pattern_meta:\n",
249
+ " print(f\"\\n📈 FROM 01c (Temporal Patterns):\")\n",
250
+ " windows_used = pattern_meta.get(\"windows_used\", {})\n",
251
+ " if windows_used:\n",
252
+ " if windows_used.get(\"aggregation_windows\"):\n",
253
+ " print(f\" Windows analyzed: {windows_used.get('aggregation_windows')}\")\n",
254
+ " if windows_used.get(\"velocity_window\"):\n",
255
+ " print(f\" Velocity window: {windows_used.get('velocity_window')} days\")\n",
256
+ " if windows_used.get(\"momentum_pairs\"):\n",
257
+ " print(f\" Momentum pairs: {windows_used.get('momentum_pairs')}\")\n",
258
+ " \n",
259
+ " trend = pattern_meta.get(\"trend\", {})\n",
260
+ " if trend and trend.get(\"direction\"):\n",
261
+ " print(f\"\\n Trend: {trend.get('direction')} (strength: {trend.get('strength', 0):.2f})\")\n",
262
+ " TREND_RECOMMENDATIONS = trend.get(\"recommendations\", [])\n",
263
+ " trend_features = [r for r in TREND_RECOMMENDATIONS if r.get(\"features\")]\n",
264
+ " if trend_features:\n",
265
+ " print(f\"\\n 📈 Trend Features to Add:\")\n",
266
+ " for rec in trend_features:\n",
267
+ " print(f\" → {', '.join(rec['features'])} ({rec['priority']} priority)\")\n",
268
+ " \n",
269
+ " # Handle both old format (list) and new format (dict with patterns and recommendations)\n",
270
+ " seasonality = pattern_meta.get(\"seasonality\", {})\n",
271
+ " if isinstance(seasonality, list):\n",
272
+ " patterns = seasonality\n",
273
+ " SEASONALITY_RECOMMENDATIONS = []\n",
274
+ " else:\n",
275
+ " patterns = seasonality.get(\"patterns\", [])\n",
276
+ " SEASONALITY_RECOMMENDATIONS = seasonality.get(\"recommendations\", [])\n",
277
+ " \n",
278
+ " if patterns:\n",
279
+ " periods = [f\"{s.get('name', 'period')} ({s.get('period')}d)\" for s in patterns[:3]]\n",
280
+ " print(f\" Seasonality: {', '.join(periods)}\")\n",
281
+ " \n",
282
+ " # Display seasonality recommendations\n",
283
+ " if SEASONALITY_RECOMMENDATIONS:\n",
284
+ " print(f\"\\n 📋 Seasonality Recommendations:\")\n",
285
+ " for rec in SEASONALITY_RECOMMENDATIONS:\n",
286
+ " action = rec.get(\"action\", \"\").replace(\"_\", \" \")\n",
287
+ " if action == \"add cyclical feature\":\n",
288
+ " print(f\" → Add {rec.get('feature')} with {rec.get('encoding')} encoding\")\n",
289
+ " elif action == \"window captures cycle\":\n",
290
+ " print(f\" → Windows {rec.get('windows')} align with detected cycles ✓\")\n",
291
+ " elif action == \"window partial cycle\":\n",
292
+ " print(f\" → Warning: Windows don't align with cycles {rec.get('detected_periods')}\")\n",
293
+ " elif action == \"consider deseasonalization\":\n",
294
+ " print(f\" → Consider deseasonalizing for periods {rec.get('periods')}\")\n",
295
+ " \n",
296
+ " recency = pattern_meta.get(\"recency\", {})\n",
297
+ " if recency and recency.get(\"median_days\"):\n",
298
+ " print(f\" Recency: median={recency.get('median_days'):.0f} days, \"\n",
299
+ " f\"target_corr={recency.get('target_correlation', 0):.2f}\")\n",
300
+ " \n",
301
+ " # Divergent columns (important for feature prioritization)\n",
302
+ " velocity = pattern_meta.get(\"velocity\", {})\n",
303
+ " divergent_velocity = [k for k, v in velocity.items() if isinstance(v, dict) and v.get(\"divergent\")]\n",
304
+ " if divergent_velocity:\n",
305
+ " print(f\"\\n 🎯 Divergent velocity columns: {divergent_velocity}\")\n",
306
+ " \n",
307
+ " momentum = pattern_meta.get(\"momentum\", {})\n",
308
+ " divergent_momentum = momentum.get(\"_divergent_columns\", [])\n",
309
+ " if divergent_momentum:\n",
310
+ " print(f\" 🎯 Divergent momentum columns: {divergent_momentum}\")\n",
311
+ "\n",
312
+ " # Extract cohort recommendations\n",
313
+ " cohort_meta = pattern_meta.get(\"cohort\", {})\n",
314
+ " if cohort_meta:\n",
315
+ " COHORT_RECOMMENDATIONS = cohort_meta.get(\"recommendations\", [])\n",
316
+ " skip_cohort = any(r.get(\"action\") == \"skip_cohort_features\" for r in COHORT_RECOMMENDATIONS)\n",
317
+ " if skip_cohort:\n",
318
+ " skip_rec = next(r for r in COHORT_RECOMMENDATIONS if r.get(\"action\") == \"skip_cohort_features\")\n",
319
+ " print(f\"\\n 👥 Cohort: Skip features - {skip_rec.get('reason', 'insufficient variation')}\")\n",
320
+ " else:\n",
321
+ " cohort_features = [r for r in COHORT_RECOMMENDATIONS if r.get(\"features\")]\n",
322
+ " if cohort_features:\n",
323
+ " print(f\"\\n 👥 Cohort Features to Add:\")\n",
324
+ " for rec in cohort_features:\n",
325
+ " print(f\" → {', '.join(rec['features'])} ({rec['priority']} priority)\")\n",
326
+ "\n",
327
+ "print(\"\\n\" + \"=\" * 70)\n",
328
+ "\n",
329
+ "# Validate that prior notebooks have been run (01a required, 01c recommended)\n",
330
+ "from customer_retention.stages.profiling import validate_temporal_findings\n",
331
+ "\n",
332
+ "validation = validate_temporal_findings(findings)\n",
333
+ "if not validation.valid:\n",
334
+ " print(\"\\n\" + \"=\" * 70)\n",
335
+ " print(\"⛔ MISSING REQUIRED ANALYSIS\")\n",
336
+ " print(\"=\" * 70)\n",
337
+ " for m in validation.missing_sections:\n",
338
+ " print(f\" - {m}\")\n",
339
+ " raise ValueError(\"Cannot proceed - run prior notebooks first\")\n",
340
+ "if validation.warnings:\n",
341
+ " print(\"\\n⚠️ VALIDATION WARNINGS:\")\n",
342
+ " for w in validation.warnings:\n",
343
+ " print(f\" - {w}\")"
344
+ ]
345
+ },
346
+ {
347
+ "cell_type": "code",
348
+ "execution_count": null,
349
+ "id": "cell-5",
350
+ "metadata": {
351
+ "execution": {
352
+ "iopub.execute_input": "2026-02-02T13:02:50.474332Z",
353
+ "iopub.status.busy": "2026-02-02T13:02:50.474223Z",
354
+ "iopub.status.idle": "2026-02-02T13:02:51.176954Z",
355
+ "shell.execute_reply": "2026-02-02T13:02:51.176285Z"
356
+ },
357
+ "papermill": {
358
+ "duration": 0.706334,
359
+ "end_time": "2026-02-02T13:02:51.177794",
360
+ "exception": false,
361
+ "start_time": "2026-02-02T13:02:50.471460",
362
+ "status": "completed"
363
+ },
364
+ "tags": []
365
+ },
366
+ "outputs": [],
367
+ "source": [
368
+ "from customer_retention.stages.temporal import load_data_with_snapshot_preference, TEMPORAL_METADATA_COLS\n",
369
+ "\n",
370
+ "# Load source data (prefers snapshots over raw files)\n",
371
+ "df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
372
+ "df[TIME_COLUMN] = pd.to_datetime(df[TIME_COLUMN])\n",
373
+ "charts = ChartBuilder()\n",
374
+ "\n",
375
+ "print(f\"Loaded {len(df):,} events x {len(df.columns)} columns\")\n",
376
+ "print(f\"Data source: {data_source}\")\n",
377
+ "print(f\"Date range: {df[TIME_COLUMN].min()} to {df[TIME_COLUMN].max()}\")"
378
+ ]
379
+ },
380
+ {
381
+ "cell_type": "code",
382
+ "execution_count": null,
383
+ "id": "swzzbsxq5e",
384
+ "metadata": {
385
+ "execution": {
386
+ "iopub.execute_input": "2026-02-02T13:02:51.183797Z",
387
+ "iopub.status.busy": "2026-02-02T13:02:51.183536Z",
388
+ "iopub.status.idle": "2026-02-02T13:02:51.196381Z",
389
+ "shell.execute_reply": "2026-02-02T13:02:51.195597Z"
390
+ },
391
+ "papermill": {
392
+ "duration": 0.01637,
393
+ "end_time": "2026-02-02T13:02:51.197095",
394
+ "exception": false,
395
+ "start_time": "2026-02-02T13:02:51.180725",
396
+ "status": "completed"
397
+ },
398
+ "tags": []
399
+ },
400
+ "outputs": [],
401
+ "source": [
402
+ "# Apply quality deduplication from 01b findings\n",
403
+ "dup_count = get_duplicate_event_count(findings)\n",
404
+ "if dup_count > 0:\n",
405
+ " df, removed = deduplicate_events(df, ENTITY_COLUMN, TIME_COLUMN, duplicate_count=dup_count)\n",
406
+ " print(f\"Deduplication: removed {removed:,} duplicate events (01b flagged {dup_count:,})\")\n",
407
+ " print(f\"Events after dedup: {len(df):,}\")\n",
408
+ "else:\n",
409
+ " print(\"No duplicate events flagged by 01b - skipping deduplication\")"
410
+ ]
411
+ },
412
+ {
413
+ "cell_type": "markdown",
414
+ "id": "cell-6",
415
+ "metadata": {
416
+ "papermill": {
417
+ "duration": 0.002363,
418
+ "end_time": "2026-02-02T13:02:51.202038",
419
+ "exception": false,
420
+ "start_time": "2026-02-02T13:02:51.199675",
421
+ "status": "completed"
422
+ },
423
+ "tags": []
424
+ },
425
+ "source": [
426
+ "## 1d.2 Configure Aggregation Based on Findings\n",
427
+ "\n",
428
+ "Apply all insights from 01a-01c to configure optimal aggregation."
429
+ ]
430
+ },
431
+ {
432
+ "cell_type": "code",
433
+ "execution_count": null,
434
+ "id": "cell-7",
435
+ "metadata": {
436
+ "execution": {
437
+ "iopub.execute_input": "2026-02-02T13:02:51.208249Z",
438
+ "iopub.status.busy": "2026-02-02T13:02:51.208115Z",
439
+ "iopub.status.idle": "2026-02-02T13:02:51.215791Z",
440
+ "shell.execute_reply": "2026-02-02T13:02:51.215205Z"
441
+ },
442
+ "papermill": {
443
+ "duration": 0.011612,
444
+ "end_time": "2026-02-02T13:02:51.216382",
445
+ "exception": false,
446
+ "start_time": "2026-02-02T13:02:51.204770",
447
+ "status": "completed"
448
+ },
449
+ "tags": []
450
+ },
451
+ "outputs": [],
452
+ "source": [
453
+ "# === AGGREGATION CONFIGURATION ===\n",
454
+ "# Windows are loaded from findings (01a recommendations) with option to override\n",
455
+ "\n",
456
+ "# Manual override (set to None to use findings recommendations)\n",
457
+ "WINDOW_OVERRIDE = None # e.g., [\"7d\", \"30d\", \"90d\"] to override\n",
458
+ "\n",
459
+ "# Get windows from findings or use defaults\n",
460
+ "if WINDOW_OVERRIDE:\n",
461
+ " WINDOWS = WINDOW_OVERRIDE\n",
462
+ " window_source = \"manual override\"\n",
463
+ "elif ts_meta.suggested_aggregations:\n",
464
+ " WINDOWS = ts_meta.suggested_aggregations\n",
465
+ " window_source = \"01a recommendations\"\n",
466
+ "else:\n",
467
+ " WINDOWS = [\"7d\", \"30d\", \"90d\", \"180d\", \"365d\", \"all_time\"]\n",
468
+ " window_source = \"defaults (no findings)\"\n",
469
+ "\n",
470
+ "# Reference date for window calculations\n",
471
+ "REFERENCE_DATE = df[TIME_COLUMN].max()\n",
472
+ "\n",
473
+ "# Load all recommendations via AggregationFeatureConfig\n",
474
+ "agg_feature_config = AggregationFeatureConfig.from_findings(findings)\n",
475
+ "\n",
476
+ "# Extract pattern metadata for feature prioritization\n",
477
+ "pattern_meta = findings.metadata.get(\"temporal_patterns\", {})\n",
478
+ "velocity_meta = pattern_meta.get(\"velocity\", {})\n",
479
+ "momentum_meta = pattern_meta.get(\"momentum\", {})\n",
480
+ "\n",
481
+ "# Identify divergent columns (these are most predictive for target)\n",
482
+ "DIVERGENT_VELOCITY_COLS = [k for k, v in velocity_meta.items() \n",
483
+ " if isinstance(v, dict) and v.get(\"divergent\")]\n",
484
+ "DIVERGENT_MOMENTUM_COLS = momentum_meta.get(\"_divergent_columns\", [])\n",
485
+ "\n",
486
+ "# Value columns: prioritize divergent columns, then other numerics\n",
487
+ "# IMPORTANT: Exclude target column and temporal metadata to prevent data leakage!\n",
488
+ "TARGET_COLUMN = findings.target_column\n",
489
+ "numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()\n",
490
+ "exclude_cols = {ENTITY_COLUMN, TIME_COLUMN} | set(TEMPORAL_METADATA_COLS)\n",
491
+ "if TARGET_COLUMN:\n",
492
+ " exclude_cols.add(TARGET_COLUMN)\n",
493
+ "available_numeric = [c for c in numeric_cols if c not in exclude_cols]\n",
494
+ "\n",
495
+ "# Put divergent columns first (they showed predictive signal in 01c)\n",
496
+ "priority_cols = [c for c in DIVERGENT_VELOCITY_COLS + DIVERGENT_MOMENTUM_COLS \n",
497
+ " if c in available_numeric]\n",
498
+ "other_cols = [c for c in available_numeric if c not in priority_cols]\n",
499
+ "\n",
500
+ "# Include text PCA columns from findings if text processing was performed\n",
501
+ "text_pca_cols = [c for c in agg_feature_config.text_pca_columns if c in df.columns]\n",
502
+ "VALUE_COLUMNS = priority_cols + other_cols + text_pca_cols\n",
503
+ "\n",
504
+ "# Aggregation functions\n",
505
+ "AGG_FUNCTIONS = [\"sum\", \"mean\", \"max\", \"count\"]\n",
506
+ "\n",
507
+ "# Lifecycle features - read from 01c feature_flags, fallback to 01a/defaults\n",
508
+ "feature_flags = pattern_meta.get(\"feature_flags\", {})\n",
509
+ "INCLUDE_LIFECYCLE_QUADRANT = feature_flags.get(\n",
510
+ " \"include_lifecycle_quadrant\",\n",
511
+ " ts_meta.temporal_segmentation_recommendation is not None\n",
512
+ ")\n",
513
+ "INCLUDE_RECENCY = feature_flags.get(\"include_recency\", True)\n",
514
+ "INCLUDE_TENURE = feature_flags.get(\"include_tenure\", True)\n",
515
+ "\n",
516
+ "# Quality: check for duplicate events from 01b\n",
517
+ "DUPLICATE_EVENT_COUNT = get_duplicate_event_count(findings)\n",
518
+ "\n",
519
+ "# Momentum recommendations for ratio features\n",
520
+ "MOMENTUM_RECOMMENDATIONS = pattern_meta.get(\"momentum\", {}).get(\"recommendations\", [])\n",
521
+ "\n",
522
+ "# Print configuration\n",
523
+ "print(\"=\" * 70)\n",
524
+ "print(\"AGGREGATION CONFIGURATION\")\n",
525
+ "print(\"=\" * 70)\n",
526
+ "print(f\"\\nWindows: {WINDOWS}\")\n",
527
+ "print(f\" Source: {window_source}\")\n",
528
+ "print(f\"\\nReference date: {REFERENCE_DATE}\")\n",
529
+ "print(f\"\\nValue columns ({len(VALUE_COLUMNS)} total):\")\n",
530
+ "if priority_cols:\n",
531
+ " print(f\" Priority (divergent): {priority_cols}\")\n",
532
+ "print(f\" Other: {other_cols[:5]}{'...' if len(other_cols) > 5 else ''}\")\n",
533
+ "if text_pca_cols:\n",
534
+ " print(f\" Text PCA: {text_pca_cols}\")\n",
535
+ "if TARGET_COLUMN:\n",
536
+ " print(f\"\\n Excluded from aggregation: {TARGET_COLUMN} (target - prevents leakage)\")\n",
537
+ "print(f\"\\nAggregation functions: {AGG_FUNCTIONS}\")\n",
538
+ "print(f\"\\nAdditional features:\")\n",
539
+ "print(f\" Include lifecycle_quadrant: {INCLUDE_LIFECYCLE_QUADRANT}\")\n",
540
+ "print(f\" Include recency: {INCLUDE_RECENCY}\")\n",
541
+ "print(f\" Include tenure: {INCLUDE_TENURE}\")\n",
542
+ "if DUPLICATE_EVENT_COUNT > 0:\n",
543
+ " print(f\"\\n Duplicate events to remove: {DUPLICATE_EVENT_COUNT:,}\")\n",
544
+ "if MOMENTUM_RECOMMENDATIONS:\n",
545
+ " print(f\" Momentum ratio features: {len(MOMENTUM_RECOMMENDATIONS)} recommendation(s)\")\n",
546
+ "\n",
547
+ "# Print recommendation summary from 01c\n",
548
+ "print(\"\\n\" + agg_feature_config.format_recommendation_summary())"
549
+ ]
550
+ },
551
+ {
552
+ "cell_type": "markdown",
553
+ "id": "cell-8",
554
+ "metadata": {
555
+ "papermill": {
556
+ "duration": 0.00218,
557
+ "end_time": "2026-02-02T13:02:51.221276",
558
+ "exception": false,
559
+ "start_time": "2026-02-02T13:02:51.219096",
560
+ "status": "completed"
561
+ },
562
+ "tags": []
563
+ },
564
+ "source": [
565
+ "## 1d.3 Preview Aggregation Plan\n",
566
+ "\n",
567
+ "See what features will be created before executing."
568
+ ]
569
+ },
570
+ {
571
+ "cell_type": "code",
572
+ "execution_count": null,
573
+ "id": "cell-9",
574
+ "metadata": {
575
+ "execution": {
576
+ "iopub.execute_input": "2026-02-02T13:02:51.227202Z",
577
+ "iopub.status.busy": "2026-02-02T13:02:51.227065Z",
578
+ "iopub.status.idle": "2026-02-02T13:02:51.231186Z",
579
+ "shell.execute_reply": "2026-02-02T13:02:51.230508Z"
580
+ },
581
+ "papermill": {
582
+ "duration": 0.00844,
583
+ "end_time": "2026-02-02T13:02:51.232099",
584
+ "exception": false,
585
+ "start_time": "2026-02-02T13:02:51.223659",
586
+ "status": "completed"
587
+ },
588
+ "tags": []
589
+ },
590
+ "outputs": [],
591
+ "source": [
592
+ "# Initialize aggregator\n",
593
+ "aggregator = TimeWindowAggregator(\n",
594
+ " entity_column=ENTITY_COLUMN,\n",
595
+ " time_column=TIME_COLUMN\n",
596
+ ")\n",
597
+ "\n",
598
+ "# Generate plan\n",
599
+ "plan = aggregator.generate_plan(\n",
600
+ " df=df,\n",
601
+ " windows=WINDOWS,\n",
602
+ " value_columns=VALUE_COLUMNS,\n",
603
+ " agg_funcs=AGG_FUNCTIONS,\n",
604
+ " include_event_count=True,\n",
605
+ " include_recency=INCLUDE_RECENCY,\n",
606
+ " include_tenure=INCLUDE_TENURE\n",
607
+ ")\n",
608
+ "\n",
609
+ "# Count additional features we'll add\n",
610
+ "additional_features = []\n",
611
+ "if INCLUDE_LIFECYCLE_QUADRANT:\n",
612
+ " additional_features.append(\"lifecycle_quadrant\")\n",
613
+ "if findings.target_column and findings.target_column in df.columns:\n",
614
+ " additional_features.append(f\"{findings.target_column} (entity target)\")\n",
615
+ "\n",
616
+ "print(\"\\n\" + \"=\"*60)\n",
617
+ "print(\"AGGREGATION PLAN\")\n",
618
+ "print(\"=\"*60)\n",
619
+ "print(f\"\\nEntity column: {plan.entity_column}\")\n",
620
+ "print(f\"Time column: {plan.time_column}\")\n",
621
+ "print(f\"Windows: {[w.name for w in plan.windows]}\")\n",
622
+ "\n",
623
+ "print(f\"\\nFeatures from aggregation ({len(plan.feature_columns)}):\")\n",
624
+ "for feat in plan.feature_columns[:15]:\n",
625
+ " # Highlight divergent column features\n",
626
+ " is_priority = any(dc in feat for dc in priority_cols) if priority_cols else False\n",
627
+ " marker = \" 🎯\" if is_priority else \"\"\n",
628
+ " print(f\" - {feat}{marker}\")\n",
629
+ "if len(plan.feature_columns) > 15:\n",
630
+ " print(f\" ... and {len(plan.feature_columns) - 15} more\")\n",
631
+ "\n",
632
+ "if additional_features:\n",
633
+ " print(f\"\\nAdditional features:\")\n",
634
+ " for feat in additional_features:\n",
635
+ " print(f\" - {feat}\")\n",
636
+ " \n",
637
+ "print(f\"\\nTotal expected features: {len(plan.feature_columns) + len(additional_features) + 1}\")"
638
+ ]
639
+ },
640
+ {
641
+ "cell_type": "markdown",
642
+ "id": "cell-10",
643
+ "metadata": {
644
+ "papermill": {
645
+ "duration": 0.002665,
646
+ "end_time": "2026-02-02T13:02:51.237863",
647
+ "exception": false,
648
+ "start_time": "2026-02-02T13:02:51.235198",
649
+ "status": "completed"
650
+ },
651
+ "tags": []
652
+ },
653
+ "source": [
654
+ "## 1d.4 Execute Aggregation"
655
+ ]
656
+ },
657
+ {
658
+ "cell_type": "code",
659
+ "execution_count": null,
660
+ "id": "cell-11",
661
+ "metadata": {
662
+ "execution": {
663
+ "iopub.execute_input": "2026-02-02T13:02:51.243846Z",
664
+ "iopub.status.busy": "2026-02-02T13:02:51.243739Z",
665
+ "iopub.status.idle": "2026-02-02T13:02:53.156123Z",
666
+ "shell.execute_reply": "2026-02-02T13:02:53.153731Z"
667
+ },
668
+ "papermill": {
669
+ "duration": 1.916389,
670
+ "end_time": "2026-02-02T13:02:53.156860",
671
+ "exception": false,
672
+ "start_time": "2026-02-02T13:02:51.240471",
673
+ "status": "completed"
674
+ },
675
+ "tags": []
676
+ },
677
+ "outputs": [],
678
+ "source": [
679
+ "print(\"Executing aggregation...\")\n",
680
+ "print(f\" Input: {len(df):,} events\")\n",
681
+ "print(f\" Expected output: {df[ENTITY_COLUMN].nunique():,} entities\")\n",
682
+ "\n",
683
+ "# Step 1: Basic time window aggregation\n",
684
+ "df_aggregated = aggregator.aggregate(\n",
685
+ " df,\n",
686
+ " windows=WINDOWS,\n",
687
+ " value_columns=VALUE_COLUMNS,\n",
688
+ " agg_funcs=AGG_FUNCTIONS,\n",
689
+ " reference_date=REFERENCE_DATE,\n",
690
+ " include_event_count=True,\n",
691
+ " include_recency=INCLUDE_RECENCY,\n",
692
+ " include_tenure=INCLUDE_TENURE\n",
693
+ ")\n",
694
+ "\n",
695
+ "# Step 2: Add lifecycle quadrant (from 01a recommendation)\n",
696
+ "if INCLUDE_LIFECYCLE_QUADRANT:\n",
697
+ " print(\"\\n Adding lifecycle_quadrant feature...\")\n",
698
+ " profiler = TimeSeriesProfiler(entity_column=ENTITY_COLUMN, time_column=TIME_COLUMN)\n",
699
+ " ts_profile = profiler.profile(df)\n",
700
+ " \n",
701
+ " # Rename 'entity' column to match our entity column name\n",
702
+ " lifecycles = ts_profile.entity_lifecycles.copy()\n",
703
+ " lifecycles = lifecycles.rename(columns={\"entity\": ENTITY_COLUMN})\n",
704
+ " \n",
705
+ " quadrant_result = classify_lifecycle_quadrants(lifecycles)\n",
706
+ " \n",
707
+ " # Merge lifecycle_quadrant into aggregated data\n",
708
+ " quadrant_map = quadrant_result.lifecycles.set_index(ENTITY_COLUMN)[\"lifecycle_quadrant\"]\n",
709
+ " df_aggregated[\"lifecycle_quadrant\"] = df_aggregated[ENTITY_COLUMN].map(quadrant_map)\n",
710
+ " \n",
711
+ " print(f\" Quadrant distribution:\")\n",
712
+ " for quad, count in df_aggregated[\"lifecycle_quadrant\"].value_counts().items():\n",
713
+ " pct = count / len(df_aggregated) * 100\n",
714
+ " print(f\" {quad}: {count:,} ({pct:.1f}%)\")\n",
715
+ "\n",
716
+ "# Step 3: Add entity-level target (if available)\n",
717
+ "TARGET_COLUMN = findings.target_column\n",
718
+ "if TARGET_COLUMN and TARGET_COLUMN in df.columns:\n",
719
+ " print(f\"\\n Adding entity-level target ({TARGET_COLUMN})...\")\n",
720
+ " # For entity-level target, use max (if any event has target=1, entity has target=1)\n",
721
+ " entity_target = df.groupby(ENTITY_COLUMN)[TARGET_COLUMN].max()\n",
722
+ " df_aggregated[TARGET_COLUMN] = df_aggregated[ENTITY_COLUMN].map(entity_target)\n",
723
+ " \n",
724
+ " target_dist = df_aggregated[TARGET_COLUMN].value_counts()\n",
725
+ " for val, count in target_dist.items():\n",
726
+ " pct = count / len(df_aggregated) * 100\n",
727
+ " print(f\" {TARGET_COLUMN}={val}: {count:,} ({pct:.1f}%)\")\n",
728
+ "\n",
729
+ "# Step 4: Add cyclical features based on seasonality recommendations\n",
730
+ "if SEASONALITY_RECOMMENDATIONS:\n",
731
+ " cyclical_added = []\n",
732
+ " for rec in SEASONALITY_RECOMMENDATIONS:\n",
733
+ " if rec.get(\"action\") == \"add_cyclical_feature\":\n",
734
+ " feature = rec.get(\"feature\")\n",
735
+ " if feature == \"day_of_week\":\n",
736
+ " entity_dow = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(\n",
737
+ " lambda x: x.dt.dayofweek.mean()\n",
738
+ " )\n",
739
+ " df_aggregated[\"dow_sin\"] = np.sin(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_dow) / 7)\n",
740
+ " df_aggregated[\"dow_cos\"] = np.cos(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_dow) / 7)\n",
741
+ " cyclical_added.append(\"day_of_week (dow_sin, dow_cos)\")\n",
742
+ " elif feature == \"day_of_month\":\n",
743
+ " entity_dom = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(\n",
744
+ " lambda x: x.dt.day.mean()\n",
745
+ " )\n",
746
+ " df_aggregated[\"dom_sin\"] = np.sin(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_dom) / 31)\n",
747
+ " df_aggregated[\"dom_cos\"] = np.cos(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_dom) / 31)\n",
748
+ " cyclical_added.append(\"day_of_month (dom_sin, dom_cos)\")\n",
749
+ " elif feature == \"quarter\":\n",
750
+ " entity_quarter = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(\n",
751
+ " lambda x: x.dt.quarter.mean()\n",
752
+ " )\n",
753
+ " df_aggregated[\"quarter_sin\"] = np.sin(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_quarter) / 4)\n",
754
+ " df_aggregated[\"quarter_cos\"] = np.cos(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_quarter) / 4)\n",
755
+ " cyclical_added.append(\"quarter (quarter_sin, quarter_cos)\")\n",
756
+ " \n",
757
+ " if cyclical_added:\n",
758
+ " print(f\"\\n Adding cyclical features from seasonality analysis:\")\n",
759
+ " for feat in cyclical_added:\n",
760
+ " print(f\" -> {feat}\")\n",
761
+ "\n",
762
+ "# Step 5: Add cyclical features based on temporal pattern analysis (from grid)\n",
763
+ "if TEMPORAL_PATTERN_RECOMMENDATIONS:\n",
764
+ " tp_added = []\n",
765
+ " for rec in TEMPORAL_PATTERN_RECOMMENDATIONS:\n",
766
+ " features = rec.get(\"features\", [])\n",
767
+ " pattern = rec.get(\"pattern\", \"\")\n",
768
+ " \n",
769
+ " if pattern == \"day_of_week\" and \"dow_sin\" in df_aggregated.columns:\n",
770
+ " continue\n",
771
+ " if pattern == \"month\" and \"month_sin\" in df_aggregated.columns:\n",
772
+ " continue\n",
773
+ " if pattern == \"quarter\" and \"quarter_sin\" in df_aggregated.columns:\n",
774
+ " continue\n",
775
+ " \n",
776
+ " if \"dow_sin\" in features or \"dow_cos\" in features:\n",
777
+ " if \"dow_sin\" not in df_aggregated.columns:\n",
778
+ " entity_dow = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.dayofweek.mean())\n",
779
+ " df_aggregated[\"dow_sin\"] = np.sin(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_dow) / 7)\n",
780
+ " df_aggregated[\"dow_cos\"] = np.cos(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_dow) / 7)\n",
781
+ " tp_added.append(\"day_of_week (dow_sin, dow_cos)\")\n",
782
+ " \n",
783
+ " if \"is_weekend\" in features:\n",
784
+ " if \"is_weekend\" not in df_aggregated.columns:\n",
785
+ " entity_weekend_pct = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(\n",
786
+ " lambda x: (x.dt.dayofweek >= 5).mean()\n",
787
+ " )\n",
788
+ " df_aggregated[\"is_weekend_pct\"] = df_aggregated[ENTITY_COLUMN].map(entity_weekend_pct)\n",
789
+ " tp_added.append(\"is_weekend_pct\")\n",
790
+ " \n",
791
+ " if \"month_sin\" in features or \"month_cos\" in features:\n",
792
+ " if \"month_sin\" not in df_aggregated.columns:\n",
793
+ " entity_month = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.month.mean())\n",
794
+ " df_aggregated[\"month_sin\"] = np.sin(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_month) / 12)\n",
795
+ " df_aggregated[\"month_cos\"] = np.cos(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_month) / 12)\n",
796
+ " tp_added.append(\"month (month_sin, month_cos)\")\n",
797
+ " \n",
798
+ " if \"quarter_sin\" in features or \"quarter_cos\" in features:\n",
799
+ " if \"quarter_sin\" not in df_aggregated.columns:\n",
800
+ " entity_quarter = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.quarter.mean())\n",
801
+ " df_aggregated[\"quarter_sin\"] = np.sin(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_quarter) / 4)\n",
802
+ " df_aggregated[\"quarter_cos\"] = np.cos(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_quarter) / 4)\n",
803
+ " tp_added.append(\"quarter (quarter_sin, quarter_cos)\")\n",
804
+ " \n",
805
+ " if \"year_trend\" in features:\n",
806
+ " if \"year_trend\" not in df_aggregated.columns:\n",
807
+ " entity_year = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.year.mean())\n",
808
+ " min_year = entity_year.min()\n",
809
+ " df_aggregated[\"year_trend\"] = df_aggregated[ENTITY_COLUMN].map(entity_year) - min_year\n",
810
+ " tp_added.append(f\"year_trend (normalized from {min_year:.0f})\")\n",
811
+ " \n",
812
+ " if \"year_categorical\" in features:\n",
813
+ " if \"year_mode\" not in df_aggregated.columns:\n",
814
+ " entity_year_mode = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(\n",
815
+ " lambda x: x.dt.year.mode().iloc[0] if len(x.dt.year.mode()) > 0 else x.dt.year.median()\n",
816
+ " )\n",
817
+ " df_aggregated[\"year_mode\"] = df_aggregated[ENTITY_COLUMN].map(entity_year_mode).astype(int)\n",
818
+ " tp_added.append(\"year_mode (categorical - encode before modeling)\")\n",
819
+ " \n",
820
+ " if tp_added:\n",
821
+ " print(f\"\\n Adding features from temporal pattern analysis:\")\n",
822
+ " for feat in tp_added:\n",
823
+ " print(f\" -> {feat}\")\n",
824
+ "\n",
825
+ "# Step 6: Add trend features based on trend recommendations\n",
826
+ "if TREND_RECOMMENDATIONS:\n",
827
+ " trend_added = []\n",
828
+ " for rec in TREND_RECOMMENDATIONS:\n",
829
+ " features = rec.get(\"features\", [])\n",
830
+ " \n",
831
+ " if \"recent_vs_overall_ratio\" in features:\n",
832
+ " if \"recent_vs_overall_ratio\" not in df_aggregated.columns:\n",
833
+ " time_span = (df[TIME_COLUMN].max() - df[TIME_COLUMN].min()).days\n",
834
+ " recent_cutoff = df[TIME_COLUMN].max() - pd.Timedelta(days=int(time_span * 0.3))\n",
835
+ " \n",
836
+ " overall_counts = df.groupby(ENTITY_COLUMN).size()\n",
837
+ " recent_counts = df[df[TIME_COLUMN] >= recent_cutoff].groupby(ENTITY_COLUMN).size()\n",
838
+ " \n",
839
+ " ratio = recent_counts / overall_counts\n",
840
+ " ratio = ratio.fillna(0)\n",
841
+ " df_aggregated[\"recent_vs_overall_ratio\"] = df_aggregated[ENTITY_COLUMN].map(ratio).fillna(0)\n",
842
+ " trend_added.append(\"recent_vs_overall_ratio\")\n",
843
+ " \n",
844
+ " if \"entity_trend_slope\" in features:\n",
845
+ " if \"entity_trend_slope\" not in df_aggregated.columns:\n",
846
+ " def compute_entity_slope(group):\n",
847
+ " if len(group) < 3:\n",
848
+ " return 0.0\n",
849
+ " x = (group[TIME_COLUMN] - group[TIME_COLUMN].min()).dt.days.values\n",
850
+ " y = np.arange(len(group))\n",
851
+ " if x.std() == 0:\n",
852
+ " return 0.0\n",
853
+ " slope = np.polyfit(x, y, 1)[0]\n",
854
+ " return slope\n",
855
+ " \n",
856
+ " entity_slopes = df.groupby(ENTITY_COLUMN).apply(compute_entity_slope)\n",
857
+ " df_aggregated[\"entity_trend_slope\"] = df_aggregated[ENTITY_COLUMN].map(entity_slopes).fillna(0)\n",
858
+ " trend_added.append(\"entity_trend_slope\")\n",
859
+ " \n",
860
+ " if trend_added:\n",
861
+ " print(f\"\\n Adding features from trend analysis:\")\n",
862
+ " for feat in trend_added:\n",
863
+ " print(f\" -> {feat}\")\n",
864
+ "\n",
865
+ "# Step 7: Add cohort features based on cohort recommendations\n",
866
+ "if COHORT_RECOMMENDATIONS:\n",
867
+ " skip_cohort = any(r.get(\"action\") == \"skip_cohort_features\" for r in COHORT_RECOMMENDATIONS)\n",
868
+ " if not skip_cohort:\n",
869
+ " cohort_added = []\n",
870
+ " cohort_features = [f for r in COHORT_RECOMMENDATIONS for f in r.get(\"features\", [])]\n",
871
+ " \n",
872
+ " if \"cohort_year\" in cohort_features or \"cohort_quarter\" in cohort_features:\n",
873
+ " entity_first = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].min()\n",
874
+ " \n",
875
+ " if \"cohort_year\" in cohort_features and \"cohort_year\" not in df_aggregated.columns:\n",
876
+ " df_aggregated[\"cohort_year\"] = df_aggregated[ENTITY_COLUMN].map(entity_first).dt.year\n",
877
+ " cohort_added.append(\"cohort_year\")\n",
878
+ " \n",
879
+ " if \"cohort_quarter\" in cohort_features and \"cohort_quarter\" not in df_aggregated.columns:\n",
880
+ " first_dates = df_aggregated[ENTITY_COLUMN].map(entity_first)\n",
881
+ " df_aggregated[\"cohort_quarter\"] = first_dates.dt.year.astype(str) + \"Q\" + first_dates.dt.quarter.astype(str)\n",
882
+ " cohort_added.append(\"cohort_quarter\")\n",
883
+ " \n",
884
+ " if cohort_added:\n",
885
+ " print(f\"\\n Adding cohort features:\")\n",
886
+ " for feat in cohort_added:\n",
887
+ " print(f\" -> {feat}\")\n",
888
+ " else:\n",
889
+ " print(f\"\\n Skipping cohort features (insufficient variation)\")\n",
890
+ "\n",
891
+ "# Step 8: Add momentum ratio features from 01c momentum recommendations\n",
892
+ "if MOMENTUM_RECOMMENDATIONS:\n",
893
+ " before_cols = set(df_aggregated.columns)\n",
894
+ " df_aggregated = create_momentum_ratio_features(df_aggregated, MOMENTUM_RECOMMENDATIONS)\n",
895
+ " new_momentum_cols = set(df_aggregated.columns) - before_cols\n",
896
+ " if new_momentum_cols:\n",
897
+ " print(f\"\\n Adding momentum ratio features:\")\n",
898
+ " for feat in sorted(new_momentum_cols):\n",
899
+ " print(f\" -> {feat}\")\n",
900
+ " else:\n",
901
+ " print(f\"\\n Momentum ratio features: columns not available in aggregated data (skipped)\")\n",
902
+ "\n",
903
+ "# Step 9: Add recency bucket feature\n",
904
+ "if INCLUDE_RECENCY and \"days_since_last_event\" in df_aggregated.columns:\n",
905
+ " df_aggregated = create_recency_bucket_feature(df_aggregated)\n",
906
+ " if \"recency_bucket\" in df_aggregated.columns:\n",
907
+ " print(f\"\\n Adding recency_bucket feature:\")\n",
908
+ " for bucket, count in df_aggregated[\"recency_bucket\"].value_counts().sort_index().items():\n",
909
+ " pct = count / len(df_aggregated) * 100\n",
910
+ " print(f\" {bucket}: {count:,} ({pct:.1f}%)\")\n",
911
+ "\n",
912
+ "print(f\"\\n Aggregation complete!\")\n",
913
+ "print(f\" Output: {len(df_aggregated):,} entities x {len(df_aggregated.columns)} features\")\n",
914
+ "print(f\" Memory: {df_aggregated.memory_usage(deep=True).sum() / 1024**2:.1f} MB\")"
915
+ ]
916
+ },
917
+ {
918
+ "cell_type": "code",
919
+ "execution_count": null,
920
+ "id": "cell-12",
921
+ "metadata": {
922
+ "execution": {
923
+ "iopub.execute_input": "2026-02-02T13:02:53.166590Z",
924
+ "iopub.status.busy": "2026-02-02T13:02:53.166444Z",
925
+ "iopub.status.idle": "2026-02-02T13:02:53.190577Z",
926
+ "shell.execute_reply": "2026-02-02T13:02:53.178724Z"
927
+ },
928
+ "papermill": {
929
+ "duration": 0.032503,
930
+ "end_time": "2026-02-02T13:02:53.192173",
931
+ "exception": false,
932
+ "start_time": "2026-02-02T13:02:53.159670",
933
+ "status": "completed"
934
+ },
935
+ "tags": []
936
+ },
937
+ "outputs": [],
938
+ "source": [
939
+ "# Preview aggregated data\n",
940
+ "print(\"\\nAggregated Data Preview:\")\n",
941
+ "display(df_aggregated.head(10))"
942
+ ]
943
+ },
944
+ {
945
+ "cell_type": "code",
946
+ "execution_count": null,
947
+ "id": "cell-13",
948
+ "metadata": {
949
+ "execution": {
950
+ "iopub.execute_input": "2026-02-02T13:02:53.199125Z",
951
+ "iopub.status.busy": "2026-02-02T13:02:53.198888Z",
952
+ "iopub.status.idle": "2026-02-02T13:02:53.239063Z",
953
+ "shell.execute_reply": "2026-02-02T13:02:53.238602Z"
954
+ },
955
+ "papermill": {
956
+ "duration": 0.044343,
957
+ "end_time": "2026-02-02T13:02:53.239638",
958
+ "exception": false,
959
+ "start_time": "2026-02-02T13:02:53.195295",
960
+ "status": "completed"
961
+ },
962
+ "tags": []
963
+ },
964
+ "outputs": [],
965
+ "source": [
966
+ "# Summary statistics\n",
967
+ "print(\"\\nFeature Summary Statistics:\")\n",
968
+ "display(df_aggregated.describe().T)"
969
+ ]
970
+ },
971
+ {
972
+ "cell_type": "markdown",
973
+ "id": "cell-14",
974
+ "metadata": {
975
+ "papermill": {
976
+ "duration": 0.002492,
977
+ "end_time": "2026-02-02T13:02:53.245201",
978
+ "exception": false,
979
+ "start_time": "2026-02-02T13:02:53.242709",
980
+ "status": "completed"
981
+ },
982
+ "tags": []
983
+ },
984
+ "source": [
985
+ "## 1d.5 Quality Check on Aggregated Data\n",
986
+ "\n",
987
+ "Quick validation of the aggregated output."
988
+ ]
989
+ },
990
+ {
991
+ "cell_type": "code",
992
+ "execution_count": null,
993
+ "id": "cell-15",
994
+ "metadata": {
995
+ "execution": {
996
+ "iopub.execute_input": "2026-02-02T13:02:53.251846Z",
997
+ "iopub.status.busy": "2026-02-02T13:02:53.251720Z",
998
+ "iopub.status.idle": "2026-02-02T13:02:53.268166Z",
999
+ "shell.execute_reply": "2026-02-02T13:02:53.267760Z"
1000
+ },
1001
+ "papermill": {
1002
+ "duration": 0.021386,
1003
+ "end_time": "2026-02-02T13:02:53.269656",
1004
+ "exception": false,
1005
+ "start_time": "2026-02-02T13:02:53.248270",
1006
+ "status": "completed"
1007
+ },
1008
+ "tags": []
1009
+ },
1010
+ "outputs": [],
1011
+ "source": [
1012
+ "print(\"=\"*60)\n",
1013
+ "print(\"AGGREGATED DATA QUALITY CHECK\")\n",
1014
+ "print(\"=\"*60)\n",
1015
+ "\n",
1016
+ "# Check for nulls\n",
1017
+ "null_counts = df_aggregated.isnull().sum()\n",
1018
+ "cols_with_nulls = null_counts[null_counts > 0]\n",
1019
+ "\n",
1020
+ "if len(cols_with_nulls) > 0:\n",
1021
+ " print(f\"\\n⚠️ Columns with null values ({len(cols_with_nulls)}):\")\n",
1022
+ " for col, count in cols_with_nulls.head(10).items():\n",
1023
+ " pct = count / len(df_aggregated) * 100\n",
1024
+ " print(f\" {col}: {count:,} ({pct:.1f}%)\")\n",
1025
+ " if len(cols_with_nulls) > 10:\n",
1026
+ " print(f\" ... and {len(cols_with_nulls) - 10} more\")\n",
1027
+ " print(\"\\n Note: Nulls in aggregated features typically mean no events in that window.\")\n",
1028
+ " print(\" Consider filling with 0 for count/sum features.\")\n",
1029
+ "else:\n",
1030
+ " print(\"\\n✅ No null values in aggregated data\")\n",
1031
+ "\n",
1032
+ "# Check entity count matches\n",
1033
+ "original_entities = df[ENTITY_COLUMN].nunique()\n",
1034
+ "aggregated_entities = len(df_aggregated)\n",
1035
+ "\n",
1036
+ "if original_entities == aggregated_entities:\n",
1037
+ " print(f\"\\n✅ Entity count matches: {aggregated_entities:,}\")\n",
1038
+ "else:\n",
1039
+ " print(f\"\\n⚠️ Entity count mismatch!\")\n",
1040
+ " print(f\" Original: {original_entities:,}\")\n",
1041
+ " print(f\" Aggregated: {aggregated_entities:,}\")\n",
1042
+ "\n",
1043
+ "# Check feature statistics\n",
1044
+ "print(f\"\\n📊 Feature Statistics:\")\n",
1045
+ "numeric_agg_cols = df_aggregated.select_dtypes(include=[np.number]).columns.tolist()\n",
1046
+ "if TARGET_COLUMN:\n",
1047
+ " numeric_agg_cols = [c for c in numeric_agg_cols if c != TARGET_COLUMN]\n",
1048
+ "\n",
1049
+ "print(f\" Total features: {len(df_aggregated.columns)}\")\n",
1050
+ "print(f\" Numeric features: {len(numeric_agg_cols)}\")\n",
1051
+ "\n",
1052
+ "# Check for constant columns (no variance)\n",
1053
+ "const_cols = [c for c in numeric_agg_cols if df_aggregated[c].std() == 0]\n",
1054
+ "if const_cols:\n",
1055
+ " print(f\"\\n⚠️ Constant columns (zero variance): {len(const_cols)}\")\n",
1056
+ " print(f\" {const_cols[:5]}{'...' if len(const_cols) > 5 else ''}\")\n",
1057
+ "\n",
1058
+ "# If lifecycle_quadrant was added, show its correlation with target\n",
1059
+ "if INCLUDE_LIFECYCLE_QUADRANT and TARGET_COLUMN and TARGET_COLUMN in df_aggregated.columns:\n",
1060
+ " print(f\"\\n📊 Lifecycle Quadrant vs Target:\")\n",
1061
+ " cross = pd.crosstab(df_aggregated[\"lifecycle_quadrant\"], df_aggregated[TARGET_COLUMN], normalize='index')\n",
1062
+ " if 1 in cross.columns:\n",
1063
+ " for quad in cross.index:\n",
1064
+ " rate = cross.loc[quad, 1] * 100\n",
1065
+ " print(f\" {quad}: {rate:.1f}% positive\")"
1066
+ ]
1067
+ },
1068
+ {
1069
+ "cell_type": "markdown",
1070
+ "id": "cell-16",
1071
+ "metadata": {
1072
+ "papermill": {
1073
+ "duration": 0.002985,
1074
+ "end_time": "2026-02-02T13:02:53.276083",
1075
+ "exception": false,
1076
+ "start_time": "2026-02-02T13:02:53.273098",
1077
+ "status": "completed"
1078
+ },
1079
+ "tags": []
1080
+ },
1081
+ "source": [
1082
+ "## 1d.6 Save Aggregated Data and Findings"
1083
+ ]
1084
+ },
1085
+ {
1086
+ "cell_type": "code",
1087
+ "execution_count": null,
1088
+ "id": "cell-17",
1089
+ "metadata": {
1090
+ "execution": {
1091
+ "iopub.execute_input": "2026-02-02T13:02:53.282170Z",
1092
+ "iopub.status.busy": "2026-02-02T13:02:53.282046Z",
1093
+ "iopub.status.idle": "2026-02-02T13:02:53.298215Z",
1094
+ "shell.execute_reply": "2026-02-02T13:02:53.297592Z"
1095
+ },
1096
+ "papermill": {
1097
+ "duration": 0.02009,
1098
+ "end_time": "2026-02-02T13:02:53.298926",
1099
+ "exception": false,
1100
+ "start_time": "2026-02-02T13:02:53.278836",
1101
+ "status": "completed"
1102
+ },
1103
+ "tags": []
1104
+ },
1105
+ "outputs": [],
1106
+ "source": [
1107
+ "# Generate output paths\n",
1108
+ "original_name = Path(findings.source_path).stem\n",
1109
+ "findings_name = Path(FINDINGS_PATH).stem.replace(\"_findings\", \"\")\n",
1110
+ "\n",
1111
+ "# Save aggregated data as parquet\n",
1112
+ "AGGREGATED_DATA_PATH = FINDINGS_DIR / f\"{findings_name}_aggregated.parquet\"\n",
1113
+ "df_aggregated.to_parquet(AGGREGATED_DATA_PATH, index=False)\n",
1114
+ "\n",
1115
+ "print(f\"\\u2705 Aggregated data saved to: {AGGREGATED_DATA_PATH}\")\n",
1116
+ "print(f\" Size: {AGGREGATED_DATA_PATH.stat().st_size / 1024:.1f} KB\")"
1117
+ ]
1118
+ },
1119
+ {
1120
+ "cell_type": "code",
1121
+ "execution_count": null,
1122
+ "id": "cell-18",
1123
+ "metadata": {
1124
+ "execution": {
1125
+ "iopub.execute_input": "2026-02-02T13:02:53.305737Z",
1126
+ "iopub.status.busy": "2026-02-02T13:02:53.305621Z",
1127
+ "iopub.status.idle": "2026-02-02T13:02:53.516684Z",
1128
+ "shell.execute_reply": "2026-02-02T13:02:53.504821Z"
1129
+ },
1130
+ "papermill": {
1131
+ "duration": 0.220598,
1132
+ "end_time": "2026-02-02T13:02:53.522820",
1133
+ "exception": false,
1134
+ "start_time": "2026-02-02T13:02:53.302222",
1135
+ "status": "completed"
1136
+ },
1137
+ "tags": []
1138
+ },
1139
+ "outputs": [],
1140
+ "source": [
1141
+ "# Create new findings for aggregated data using DataExplorer\n",
1142
+ "print(\"\\nGenerating findings for aggregated data...\")\n",
1143
+ "\n",
1144
+ "explorer = DataExplorer(output_dir=str(FINDINGS_DIR))\n",
1145
+ "aggregated_findings = explorer.explore(\n",
1146
+ " str(AGGREGATED_DATA_PATH),\n",
1147
+ " name=f\"{findings_name}_aggregated\"\n",
1148
+ ")\n",
1149
+ "\n",
1150
+ "AGGREGATED_FINDINGS_PATH = explorer.last_findings_path\n",
1151
+ "print(f\"✅ Aggregated findings saved to: {AGGREGATED_FINDINGS_PATH}\")"
1152
+ ]
1153
+ },
1154
+ {
1155
+ "cell_type": "code",
1156
+ "execution_count": null,
1157
+ "id": "cell-19",
1158
+ "metadata": {
1159
+ "execution": {
1160
+ "iopub.execute_input": "2026-02-02T13:02:53.532471Z",
1161
+ "iopub.status.busy": "2026-02-02T13:02:53.532330Z",
1162
+ "iopub.status.idle": "2026-02-02T13:02:54.958297Z",
1163
+ "shell.execute_reply": "2026-02-02T13:02:54.957669Z"
1164
+ },
1165
+ "papermill": {
1166
+ "duration": 1.431931,
1167
+ "end_time": "2026-02-02T13:02:54.959093",
1168
+ "exception": false,
1169
+ "start_time": "2026-02-02T13:02:53.527162",
1170
+ "status": "completed"
1171
+ },
1172
+ "tags": []
1173
+ },
1174
+ "outputs": [],
1175
+ "source": [
1176
+ "# Update original findings with comprehensive aggregation metadata\n",
1177
+ "findings.time_series_metadata.aggregation_executed = True\n",
1178
+ "findings.time_series_metadata.aggregated_data_path = str(AGGREGATED_DATA_PATH)\n",
1179
+ "findings.time_series_metadata.aggregated_findings_path = str(AGGREGATED_FINDINGS_PATH)\n",
1180
+ "findings.time_series_metadata.aggregation_windows_used = WINDOWS\n",
1181
+ "findings.time_series_metadata.aggregation_timestamp = datetime.now().isoformat()\n",
1182
+ "\n",
1183
+ "# Add aggregation details to metadata\n",
1184
+ "findings.metadata[\"aggregation\"] = {\n",
1185
+ " \"windows_used\": WINDOWS,\n",
1186
+ " \"window_source\": window_source,\n",
1187
+ " \"reference_date\": str(REFERENCE_DATE),\n",
1188
+ " \"value_columns_count\": len(VALUE_COLUMNS),\n",
1189
+ " \"priority_columns\": priority_cols, # Divergent columns from 01c\n",
1190
+ " \"agg_functions\": AGG_FUNCTIONS,\n",
1191
+ " \"include_lifecycle_quadrant\": INCLUDE_LIFECYCLE_QUADRANT,\n",
1192
+ " \"include_recency\": INCLUDE_RECENCY,\n",
1193
+ " \"include_tenure\": INCLUDE_TENURE,\n",
1194
+ " \"output_entities\": len(df_aggregated),\n",
1195
+ " \"output_features\": len(df_aggregated.columns),\n",
1196
+ " \"target_column\": TARGET_COLUMN,\n",
1197
+ "}\n",
1198
+ "\n",
1199
+ "findings.save(FINDINGS_PATH)\n",
1200
+ "print(f\"✅ Original findings updated with aggregation metadata: {FINDINGS_PATH}\")\n",
1201
+ "\n",
1202
+ "from customer_retention.analysis.notebook_html_exporter import export_notebook_html\n",
1203
+ "export_notebook_html(Path(\"01d_event_aggregation.ipynb\"), EXPERIMENTS_DIR / \"docs\")\n"
1204
+ ]
1205
+ },
1206
+ {
1207
+ "cell_type": "code",
1208
+ "execution_count": null,
1209
+ "id": "cell-20",
1210
+ "metadata": {
1211
+ "execution": {
1212
+ "iopub.execute_input": "2026-02-02T13:02:54.965848Z",
1213
+ "iopub.status.busy": "2026-02-02T13:02:54.965720Z",
1214
+ "iopub.status.idle": "2026-02-02T13:02:54.970625Z",
1215
+ "shell.execute_reply": "2026-02-02T13:02:54.970103Z"
1216
+ },
1217
+ "papermill": {
1218
+ "duration": 0.009208,
1219
+ "end_time": "2026-02-02T13:02:54.971347",
1220
+ "exception": false,
1221
+ "start_time": "2026-02-02T13:02:54.962139",
1222
+ "status": "completed"
1223
+ },
1224
+ "tags": []
1225
+ },
1226
+ "outputs": [],
1227
+ "source": [
1228
+ "# Summary of outputs\n",
1229
+ "print(\"\\n\" + \"=\"*70)\n",
1230
+ "print(\"AGGREGATION COMPLETE - OUTPUT SUMMARY\")\n",
1231
+ "print(\"=\"*70)\n",
1232
+ "\n",
1233
+ "print(f\"\\n📁 Files created:\")\n",
1234
+ "print(f\" 1. Aggregated data: {AGGREGATED_DATA_PATH}\")\n",
1235
+ "print(f\" 2. Aggregated findings: {AGGREGATED_FINDINGS_PATH}\")\n",
1236
+ "print(f\" 3. Updated original findings: {FINDINGS_PATH}\")\n",
1237
+ "\n",
1238
+ "print(f\"\\n📊 Transformation stats:\")\n",
1239
+ "print(f\" Input events: {len(df):,}\")\n",
1240
+ "print(f\" Output entities: {len(df_aggregated):,}\")\n",
1241
+ "print(f\" Features created: {len(df_aggregated.columns)}\")\n",
1242
+ "\n",
1243
+ "print(f\"\\n⚙️ Configuration applied:\")\n",
1244
+ "print(f\" Windows: {WINDOWS} (from {window_source})\")\n",
1245
+ "print(f\" Aggregation functions: {AGG_FUNCTIONS}\")\n",
1246
+ "if priority_cols:\n",
1247
+ " print(f\" Priority columns (from 01c divergence): {priority_cols}\")\n",
1248
+ "if INCLUDE_LIFECYCLE_QUADRANT:\n",
1249
+ " print(f\" Lifecycle quadrant: included (from 01a recommendation)\")\n",
1250
+ "\n",
1251
+ "print(f\"\\n🎯 Ready for modeling:\")\n",
1252
+ "print(f\" Entity column: {ENTITY_COLUMN}\")\n",
1253
+ "if TARGET_COLUMN:\n",
1254
+ " print(f\" Target column: {TARGET_COLUMN}\")\n",
1255
+ " if TARGET_COLUMN in df_aggregated.columns:\n",
1256
+ " positive_rate = df_aggregated[TARGET_COLUMN].mean() * 100\n",
1257
+ " print(f\" Target positive rate: {positive_rate:.1f}%\")\n",
1258
+ "\n",
1259
+ "# Drift warning if applicable\n",
1260
+ "if ts_meta.drift_risk_level == \"high\":\n",
1261
+ " print(f\"\\n⚠️ DRIFT WARNING: High drift risk detected in 01a\")\n",
1262
+ " print(f\" Volume drift: {ts_meta.volume_drift_risk or 'unknown'}\")\n",
1263
+ " print(f\" Consider: temporal validation splits, monitoring for distribution shift\")"
1264
+ ]
1265
+ },
1266
+ {
1267
+ "cell_type": "markdown",
1268
+ "id": "f616cd5a",
1269
+ "metadata": {
1270
+ "papermill": {
1271
+ "duration": 0.002909,
1272
+ "end_time": "2026-02-02T13:02:54.977579",
1273
+ "exception": false,
1274
+ "start_time": "2026-02-02T13:02:54.974670",
1275
+ "status": "completed"
1276
+ },
1277
+ "tags": []
1278
+ },
1279
+ "source": [
1280
+ "## 1d.X Leakage Validation\n",
1281
+ "\n",
1282
+ "**CRITICAL CHECK:** Verify no target leakage in aggregated features before proceeding.\n",
1283
+ "\n",
1284
+ "| Check | What It Detects | Severity |\n",
1285
+ "|-------|-----------------|----------|\n",
1286
+ "| LD052 | Target column or target-derived features in feature matrix | CRITICAL |\n",
1287
+ "| LD053 | Domain patterns (churn/cancel/retain) with high correlation | CRITICAL |\n",
1288
+ "| LD001-003 | Suspiciously high feature-target correlations | HIGH |\n",
1289
+ "\n",
1290
+ "**If any CRITICAL issues are detected, do NOT proceed to modeling.**"
1291
+ ]
1292
+ },
1293
+ {
1294
+ "cell_type": "code",
1295
+ "execution_count": null,
1296
+ "id": "1319267b",
1297
+ "metadata": {
1298
+ "execution": {
1299
+ "iopub.execute_input": "2026-02-02T13:02:54.984304Z",
1300
+ "iopub.status.busy": "2026-02-02T13:02:54.984185Z",
1301
+ "iopub.status.idle": "2026-02-02T13:02:55.847864Z",
1302
+ "shell.execute_reply": "2026-02-02T13:02:55.847349Z"
1303
+ },
1304
+ "papermill": {
1305
+ "duration": 0.867799,
1306
+ "end_time": "2026-02-02T13:02:55.848565",
1307
+ "exception": false,
1308
+ "start_time": "2026-02-02T13:02:54.980766",
1309
+ "status": "completed"
1310
+ },
1311
+ "tags": []
1312
+ },
1313
+ "outputs": [],
1314
+ "source": [
1315
+ "# Leakage validation - MUST pass before proceeding to modeling\n",
1316
+ "from customer_retention.analysis.diagnostics import LeakageDetector\n",
1317
+ "\n",
1318
+ "if TARGET_COLUMN and TARGET_COLUMN in df_aggregated.columns:\n",
1319
+ " detector = LeakageDetector()\n",
1320
+ " \n",
1321
+ " # Separate features and target\n",
1322
+ " feature_cols = [c for c in df_aggregated.columns if c not in [ENTITY_COLUMN, TARGET_COLUMN]]\n",
1323
+ " X = df_aggregated[feature_cols]\n",
1324
+ " y = df_aggregated[TARGET_COLUMN]\n",
1325
+ " \n",
1326
+ " # Run leakage checks\n",
1327
+ " result = detector.run_all_checks(X, y, include_pit=False)\n",
1328
+ " \n",
1329
+ " print(\"=\" * 70)\n",
1330
+ " print(\"LEAKAGE VALIDATION RESULTS\")\n",
1331
+ " print(\"=\" * 70)\n",
1332
+ " \n",
1333
+ " if result.passed:\n",
1334
+ " print(\"\\n✅ PASSED: No critical leakage issues detected\")\n",
1335
+ " print(f\" Total checks run: {len(result.checks)}\")\n",
1336
+ " print(\"\\n You may proceed to feature engineering and modeling.\")\n",
1337
+ " else:\n",
1338
+ " print(\"\\n❌ FAILED: Critical leakage issues detected!\")\n",
1339
+ " print(f\" Critical issues: {len(result.critical_issues)}\")\n",
1340
+ " print(\"\\n DO NOT proceed to modeling until issues are resolved:\\n\")\n",
1341
+ " for issue in result.critical_issues:\n",
1342
+ " print(f\" [{issue.check_id}] {issue.feature}: {issue.recommendation}\")\n",
1343
+ " print(\"\\n\" + \"=\" * 70)\n",
1344
+ " raise ValueError(f\"Leakage detected: {len(result.critical_issues)} critical issues\")\n",
1345
+ "else:\n",
1346
+ " print(\"No target column - skipping leakage validation\")"
1347
+ ]
1348
+ },
1349
+ {
1350
+ "cell_type": "markdown",
1351
+ "id": "cell-21",
1352
+ "metadata": {
1353
+ "papermill": {
1354
+ "duration": 0.002886,
1355
+ "end_time": "2026-02-02T13:02:55.854862",
1356
+ "exception": false,
1357
+ "start_time": "2026-02-02T13:02:55.851976",
1358
+ "status": "completed"
1359
+ },
1360
+ "tags": []
1361
+ },
1362
+ "source": [
1363
+ "---\n",
1364
+ "\n",
1365
+ "## Summary: What We Did\n",
1366
+ "\n",
1367
+ "In this notebook, we transformed event-level data to entity-level, applying all insights from 01a-01c:\n",
1368
+ "\n",
1369
+ "1. **Loaded findings** from prior notebooks (windows, patterns, quality)\n",
1370
+ "2. **Configured aggregation** using recommended windows from 01a\n",
1371
+ "3. **Prioritized features** based on divergent columns from 01c velocity/momentum analysis\n",
1372
+ "4. **Added lifecycle_quadrant** as recommended by 01a segmentation analysis\n",
1373
+ "5. **Added entity-level target** for downstream modeling\n",
1374
+ "6. **Saved outputs** - aggregated data, findings, and metadata\n",
1375
+ "\n",
1376
+ "## How Findings Were Applied\n",
1377
+ "\n",
1378
+ "| Finding | Source | Application |\n",
1379
+ "|---------|--------|-------------|\n",
1380
+ "| Aggregation windows | 01a | Used `suggested_aggregations` instead of defaults |\n",
1381
+ "| Lifecycle quadrant | 01a | Added as categorical feature for model |\n",
1382
+ "| Divergent columns | 01c | Prioritized in feature list (velocity/momentum signal) |\n",
1383
+ "| Drift warning | 01a | Flagged for temporal validation consideration |\n",
1384
+ "\n",
1385
+ "## Output Files\n",
1386
+ "\n",
1387
+ "| File | Purpose | Next Use |\n",
1388
+ "|------|---------|----------|\n",
1389
+ "| `*_aggregated.parquet` | Entity-level data with temporal features | Input for notebooks 02-04 |\n",
1390
+ "| `*_aggregated_findings.yaml` | Auto-profiled findings | Loaded by 02_column_deep_dive |\n",
1391
+ "| Original findings (updated) | Aggregation tracking | Reference and lineage |\n",
1392
+ "\n",
1393
+ "---\n",
1394
+ "\n",
1395
+ "## Next Steps\n",
1396
+ "\n",
1397
+ "**Event Bronze Track complete!** Continue with the **Entity Bronze Track** on the aggregated data:\n",
1398
+ "\n",
1399
+ "1. **02_column_deep_dive.ipynb** - Profile the aggregated feature distributions\n",
1400
+ "2. **03_quality_assessment.ipynb** - Run quality checks on entity-level data \n",
1401
+ "3. **04_relationship_analysis.ipynb** - Analyze feature correlations and target relationships\n",
1402
+ "\n",
1403
+ "The notebooks will auto-discover the aggregated findings file (most recently modified).\n",
1404
+ "\n",
1405
+ "```python\n",
1406
+ "# The aggregated findings file is now the most recent, so notebooks 02-04\n",
1407
+ "# will automatically use it via the standard discovery pattern.\n",
1408
+ "```"
1409
+ ]
1410
+ },
1411
+ {
1412
+ "cell_type": "markdown",
1413
+ "id": "sqgxsjv243",
1414
+ "metadata": {
1415
+ "papermill": {
1416
+ "duration": 0.002771,
1417
+ "end_time": "2026-02-02T13:02:55.861097",
1418
+ "exception": false,
1419
+ "start_time": "2026-02-02T13:02:55.858326",
1420
+ "status": "completed"
1421
+ },
1422
+ "tags": []
1423
+ },
1424
+ "source": [
1425
+ "> **Save Reminder:** Save this notebook (Ctrl+S / Cmd+S) before running the next one.\n",
1426
+ "> The next notebook will automatically export this notebook's HTML documentation from the saved file."
1427
+ ]
1428
+ }
1429
+ ],
1430
+ "metadata": {
1431
+ "kernelspec": {
1432
+ "display_name": "Python 3",
1433
+ "language": "python",
1434
+ "name": "python3"
1435
+ },
1436
+ "language_info": {
1437
+ "codemirror_mode": {
1438
+ "name": "ipython",
1439
+ "version": 3
1440
+ },
1441
+ "file_extension": ".py",
1442
+ "mimetype": "text/x-python",
1443
+ "name": "python",
1444
+ "nbconvert_exporter": "python",
1445
+ "pygments_lexer": "ipython3",
1446
+ "version": "3.12.4"
1447
+ },
1448
+ "papermill": {
1449
+ "default_parameters": {},
1450
+ "duration": 10.628627,
1451
+ "end_time": "2026-02-02T13:02:58.480658",
1452
+ "environment_variables": {},
1453
+ "exception": null,
1454
+ "input_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/01d_event_aggregation.ipynb",
1455
+ "output_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/01d_event_aggregation.ipynb",
1456
+ "parameters": {},
1457
+ "start_time": "2026-02-02T13:02:47.852031",
1458
+ "version": "2.6.0"
1459
+ }
1460
+ },
1461
+ "nbformat": 4,
1462
+ "nbformat_minor": 5
1463
+ }