churnkit-0.76.0a3-py3-none-any.whl → churnkit-0.76.1a2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +10 -5
  2. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +11 -9
  3. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +52 -46
  4. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +68 -65
  5. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +12 -27
  6. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +216 -221
  7. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +88 -81
  8. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +111 -108
  9. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +44 -38
  10. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +89 -85
  11. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +81 -80
  12. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +83 -89
  13. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +102 -98
  14. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +32 -31
  15. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +33 -29
  16. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +6 -5
  17. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +67 -63
  18. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +38 -23
  19. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +3 -1
  20. {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/METADATA +1 -1
  21. {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/RECORD +31 -31
  22. customer_retention/__init__.py +1 -1
  23. customer_retention/analysis/auto_explorer/explorer.py +2 -2
  24. customer_retention/analysis/notebook_progress.py +14 -2
  25. customer_retention/core/compat/__init__.py +10 -0
  26. customer_retention/core/config/experiments.py +45 -0
  27. customer_retention/integrations/databricks_init.py +41 -1
  28. customer_retention/stages/profiling/column_profiler.py +9 -2
  29. {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/WHEEL +0 -0
  30. {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/entry_points.txt +0 -0
  31. {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/licenses/LICENSE +0 -0
@@ -95,27 +95,31 @@
  "outputs": [],
  "source": [
  "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+ "\n",
  "track_and_export_previous(\"01d_event_aggregation.ipynb\")\n",
  "\n",
- "from customer_retention.analysis.auto_explorer import ExplorationFindings, DataExplorer\n",
- "from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
- "from customer_retention.core.config.column_config import ColumnType, DatasetGranularity\n",
+ "from datetime import datetime\n",
+ "from pathlib import Path\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "from customer_retention.analysis.auto_explorer import DataExplorer, ExplorationFindings\n",
+ "from customer_retention.analysis.visualization import ChartBuilder\n",
+ "from customer_retention.core.config.experiments import (\n",
+ " EXPERIMENTS_DIR,\n",
+ " FINDINGS_DIR,\n",
+ ")\n",
  "from customer_retention.stages.profiling import (\n",
  " AggregationFeatureConfig,\n",
- " TimeWindowAggregator,\n",
  " TimeSeriesProfiler,\n",
+ " TimeWindowAggregator,\n",
  " classify_lifecycle_quadrants,\n",
- " classify_activity_segments,\n",
  " create_momentum_ratio_features,\n",
  " create_recency_bucket_feature,\n",
  " deduplicate_events,\n",
  " get_duplicate_event_count,\n",
- ")\n",
- "from datetime import datetime\n",
- "from pathlib import Path\n",
- "import pandas as pd\n",
- "import numpy as np\n",
- "from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure"
+ ")"
  ]
  },
  {
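The reorganized imports above now pull only EXPERIMENTS_DIR and FINDINGS_DIR from customer_retention.core.config.experiments, a module that grows by 45 lines in this release (file 26 in the list above) but is not itself shown in this diff. As a minimal sketch only, assuming the module centralizes experiment output paths, it might resemble the following; every value is a guess, and OUTPUT_DIR and setup_experiments_structure are simply the names that appear in the removed import line.

from pathlib import Path

# Hypothetical sketch - the real customer_retention/core/config/experiments.py
# is not part of this diff excerpt.
EXPERIMENTS_DIR = Path("experiments")           # root folder for exploration-notebook runs
FINDINGS_DIR = EXPERIMENTS_DIR / "findings"     # *_findings.yaml files consumed by later notebooks
OUTPUT_DIR = EXPERIMENTS_DIR / "output"         # aggregated datasets and exports

def setup_experiments_structure() -> None:
    # Assumed helper: create the directory tree above if it does not exist.
    for directory in (EXPERIMENTS_DIR, FINDINGS_DIR, OUTPUT_DIR):
        directory.mkdir(parents=True, exist_ok=True)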
@@ -145,7 +149,7 @@
  "\n",
  "# Find findings files (exclude multi_dataset and already-aggregated)\n",
  "findings_files = [\n",
- " f for f in FINDINGS_DIR.glob(\"*_findings.yaml\") \n",
+ " f for f in FINDINGS_DIR.glob(\"*_findings.yaml\")\n",
  " if \"multi_dataset\" not in f.name and \"_aggregated\" not in f.name\n",
  "]\n",
  "if not findings_files:\n",
@@ -212,7 +216,7 @@
  " print(\"\\n ⚠️ No window recommendations - will use defaults\")\n",
  "\n",
  "if ts_meta.temporal_segmentation_recommendation:\n",
- " print(f\"\\n 📋 Segmentation recommendation:\")\n",
+ " print(\"\\n 📋 Segmentation recommendation:\")\n",
  " print(f\" {ts_meta.temporal_segmentation_recommendation}\")\n",
  " if ts_meta.heterogeneity_level:\n",
  " print(f\" Heterogeneity: {ts_meta.heterogeneity_level}\")\n",
@@ -227,7 +231,7 @@
  "# === 01b: Temporal Quality ===\n",
  "quality_meta = findings.metadata.get(\"temporal_quality\", {})\n",
  "if quality_meta:\n",
- " print(f\"\\n📋 FROM 01b (Temporal Quality):\")\n",
+ " print(\"\\n📋 FROM 01b (Temporal Quality):\")\n",
  " if quality_meta.get(\"temporal_quality_score\"):\n",
  " print(f\" Quality score: {quality_meta.get('temporal_quality_score'):.1f}\")\n",
  " if quality_meta.get(\"temporal_quality_grade\"):\n",
@@ -246,7 +250,7 @@
  "COHORT_RECOMMENDATIONS = [] # Store for later application\n",
  "\n",
  "if pattern_meta:\n",
- " print(f\"\\n📈 FROM 01c (Temporal Patterns):\")\n",
+ " print(\"\\n📈 FROM 01c (Temporal Patterns):\")\n",
  " windows_used = pattern_meta.get(\"windows_used\", {})\n",
  " if windows_used:\n",
  " if windows_used.get(\"aggregation_windows\"):\n",
@@ -255,17 +259,17 @@
  " print(f\" Velocity window: {windows_used.get('velocity_window')} days\")\n",
  " if windows_used.get(\"momentum_pairs\"):\n",
  " print(f\" Momentum pairs: {windows_used.get('momentum_pairs')}\")\n",
- " \n",
+ "\n",
  " trend = pattern_meta.get(\"trend\", {})\n",
  " if trend and trend.get(\"direction\"):\n",
  " print(f\"\\n Trend: {trend.get('direction')} (strength: {trend.get('strength', 0):.2f})\")\n",
  " TREND_RECOMMENDATIONS = trend.get(\"recommendations\", [])\n",
  " trend_features = [r for r in TREND_RECOMMENDATIONS if r.get(\"features\")]\n",
  " if trend_features:\n",
- " print(f\"\\n 📈 Trend Features to Add:\")\n",
+ " print(\"\\n 📈 Trend Features to Add:\")\n",
  " for rec in trend_features:\n",
  " print(f\" → {', '.join(rec['features'])} ({rec['priority']} priority)\")\n",
- " \n",
+ "\n",
  " # Handle both old format (list) and new format (dict with patterns and recommendations)\n",
  " seasonality = pattern_meta.get(\"seasonality\", {})\n",
  " if isinstance(seasonality, list):\n",
@@ -274,14 +278,14 @@
  " else:\n",
  " patterns = seasonality.get(\"patterns\", [])\n",
  " SEASONALITY_RECOMMENDATIONS = seasonality.get(\"recommendations\", [])\n",
- " \n",
+ "\n",
  " if patterns:\n",
  " periods = [f\"{s.get('name', 'period')} ({s.get('period')}d)\" for s in patterns[:3]]\n",
  " print(f\" Seasonality: {', '.join(periods)}\")\n",
- " \n",
+ "\n",
  " # Display seasonality recommendations\n",
  " if SEASONALITY_RECOMMENDATIONS:\n",
- " print(f\"\\n 📋 Seasonality Recommendations:\")\n",
+ " print(\"\\n 📋 Seasonality Recommendations:\")\n",
  " for rec in SEASONALITY_RECOMMENDATIONS:\n",
  " action = rec.get(\"action\", \"\").replace(\"_\", \" \")\n",
  " if action == \"add cyclical feature\":\n",
@@ -292,18 +296,18 @@
  " print(f\" → Warning: Windows don't align with cycles {rec.get('detected_periods')}\")\n",
  " elif action == \"consider deseasonalization\":\n",
  " print(f\" → Consider deseasonalizing for periods {rec.get('periods')}\")\n",
- " \n",
+ "\n",
  " recency = pattern_meta.get(\"recency\", {})\n",
  " if recency and recency.get(\"median_days\"):\n",
  " print(f\" Recency: median={recency.get('median_days'):.0f} days, \"\n",
  " f\"target_corr={recency.get('target_correlation', 0):.2f}\")\n",
- " \n",
+ "\n",
  " # Divergent columns (important for feature prioritization)\n",
  " velocity = pattern_meta.get(\"velocity\", {})\n",
  " divergent_velocity = [k for k, v in velocity.items() if isinstance(v, dict) and v.get(\"divergent\")]\n",
  " if divergent_velocity:\n",
  " print(f\"\\n 🎯 Divergent velocity columns: {divergent_velocity}\")\n",
- " \n",
+ "\n",
  " momentum = pattern_meta.get(\"momentum\", {})\n",
  " divergent_momentum = momentum.get(\"_divergent_columns\", [])\n",
  " if divergent_momentum:\n",
@@ -320,7 +324,7 @@
  " else:\n",
  " cohort_features = [r for r in COHORT_RECOMMENDATIONS if r.get(\"features\")]\n",
  " if cohort_features:\n",
- " print(f\"\\n 👥 Cohort Features to Add:\")\n",
+ " print(\"\\n 👥 Cohort Features to Add:\")\n",
  " for rec in cohort_features:\n",
  " print(f\" → {', '.join(rec['features'])} ({rec['priority']} priority)\")\n",
  "\n",
@@ -365,7 +369,7 @@
  },
  "outputs": [],
  "source": [
- "from customer_retention.stages.temporal import load_data_with_snapshot_preference, TEMPORAL_METADATA_COLS\n",
+ "from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n",
  "\n",
  "# Load source data (prefers snapshots over raw files)\n",
  "df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
@@ -479,7 +483,7 @@
  "momentum_meta = pattern_meta.get(\"momentum\", {})\n",
  "\n",
  "# Identify divergent columns (these are most predictive for target)\n",
- "DIVERGENT_VELOCITY_COLS = [k for k, v in velocity_meta.items() \n",
+ "DIVERGENT_VELOCITY_COLS = [k for k, v in velocity_meta.items()\n",
  " if isinstance(v, dict) and v.get(\"divergent\")]\n",
  "DIVERGENT_MOMENTUM_COLS = momentum_meta.get(\"_divergent_columns\", [])\n",
  "\n",
@@ -493,7 +497,7 @@
  "available_numeric = [c for c in numeric_cols if c not in exclude_cols]\n",
  "\n",
  "# Put divergent columns first (they showed predictive signal in 01c)\n",
- "priority_cols = [c for c in DIVERGENT_VELOCITY_COLS + DIVERGENT_MOMENTUM_COLS \n",
+ "priority_cols = [c for c in DIVERGENT_VELOCITY_COLS + DIVERGENT_MOMENTUM_COLS\n",
  " if c in available_numeric]\n",
  "other_cols = [c for c in available_numeric if c not in priority_cols]\n",
  "\n",
@@ -535,7 +539,7 @@
  "if TARGET_COLUMN:\n",
  " print(f\"\\n Excluded from aggregation: {TARGET_COLUMN} (target - prevents leakage)\")\n",
  "print(f\"\\nAggregation functions: {AGG_FUNCTIONS}\")\n",
- "print(f\"\\nAdditional features:\")\n",
+ "print(\"\\nAdditional features:\")\n",
  "print(f\" Include lifecycle_quadrant: {INCLUDE_LIFECYCLE_QUADRANT}\")\n",
  "print(f\" Include recency: {INCLUDE_RECENCY}\")\n",
  "print(f\" Include tenure: {INCLUDE_TENURE}\")\n",
@@ -630,10 +634,10 @@
  " print(f\" ... and {len(plan.feature_columns) - 15} more\")\n",
  "\n",
  "if additional_features:\n",
- " print(f\"\\nAdditional features:\")\n",
+ " print(\"\\nAdditional features:\")\n",
  " for feat in additional_features:\n",
  " print(f\" - {feat}\")\n",
- " \n",
+ "\n",
  "print(f\"\\nTotal expected features: {len(plan.feature_columns) + len(additional_features) + 1}\")"
  ]
  },
@@ -697,18 +701,18 @@
  " print(\"\\n Adding lifecycle_quadrant feature...\")\n",
  " profiler = TimeSeriesProfiler(entity_column=ENTITY_COLUMN, time_column=TIME_COLUMN)\n",
  " ts_profile = profiler.profile(df)\n",
- " \n",
+ "\n",
  " # Rename 'entity' column to match our entity column name\n",
  " lifecycles = ts_profile.entity_lifecycles.copy()\n",
  " lifecycles = lifecycles.rename(columns={\"entity\": ENTITY_COLUMN})\n",
- " \n",
+ "\n",
  " quadrant_result = classify_lifecycle_quadrants(lifecycles)\n",
- " \n",
+ "\n",
  " # Merge lifecycle_quadrant into aggregated data\n",
  " quadrant_map = quadrant_result.lifecycles.set_index(ENTITY_COLUMN)[\"lifecycle_quadrant\"]\n",
  " df_aggregated[\"lifecycle_quadrant\"] = df_aggregated[ENTITY_COLUMN].map(quadrant_map)\n",
- " \n",
- " print(f\" Quadrant distribution:\")\n",
+ "\n",
+ " print(\" Quadrant distribution:\")\n",
  " for quad, count in df_aggregated[\"lifecycle_quadrant\"].value_counts().items():\n",
  " pct = count / len(df_aggregated) * 100\n",
  " print(f\" {quad}: {count:,} ({pct:.1f}%)\")\n",
@@ -720,7 +724,7 @@
  " # For entity-level target, use max (if any event has target=1, entity has target=1)\n",
  " entity_target = df.groupby(ENTITY_COLUMN)[TARGET_COLUMN].max()\n",
  " df_aggregated[TARGET_COLUMN] = df_aggregated[ENTITY_COLUMN].map(entity_target)\n",
- " \n",
+ "\n",
  " target_dist = df_aggregated[TARGET_COLUMN].value_counts()\n",
  " for val, count in target_dist.items():\n",
  " pct = count / len(df_aggregated) * 100\n",
@@ -753,9 +757,9 @@
  " df_aggregated[\"quarter_sin\"] = np.sin(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_quarter) / 4)\n",
  " df_aggregated[\"quarter_cos\"] = np.cos(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_quarter) / 4)\n",
  " cyclical_added.append(\"quarter (quarter_sin, quarter_cos)\")\n",
- " \n",
+ "\n",
  " if cyclical_added:\n",
- " print(f\"\\n Adding cyclical features from seasonality analysis:\")\n",
+ " print(\"\\n Adding cyclical features from seasonality analysis:\")\n",
  " for feat in cyclical_added:\n",
  " print(f\" -> {feat}\")\n",
  "\n",
@@ -765,21 +769,21 @@
  " for rec in TEMPORAL_PATTERN_RECOMMENDATIONS:\n",
  " features = rec.get(\"features\", [])\n",
  " pattern = rec.get(\"pattern\", \"\")\n",
- " \n",
+ "\n",
  " if pattern == \"day_of_week\" and \"dow_sin\" in df_aggregated.columns:\n",
  " continue\n",
  " if pattern == \"month\" and \"month_sin\" in df_aggregated.columns:\n",
  " continue\n",
  " if pattern == \"quarter\" and \"quarter_sin\" in df_aggregated.columns:\n",
  " continue\n",
- " \n",
+ "\n",
  " if \"dow_sin\" in features or \"dow_cos\" in features:\n",
  " if \"dow_sin\" not in df_aggregated.columns:\n",
  " entity_dow = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.dayofweek.mean())\n",
  " df_aggregated[\"dow_sin\"] = np.sin(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_dow) / 7)\n",
  " df_aggregated[\"dow_cos\"] = np.cos(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_dow) / 7)\n",
  " tp_added.append(\"day_of_week (dow_sin, dow_cos)\")\n",
- " \n",
+ "\n",
  " if \"is_weekend\" in features:\n",
  " if \"is_weekend\" not in df_aggregated.columns:\n",
  " entity_weekend_pct = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(\n",
@@ -787,28 +791,28 @@
  " )\n",
  " df_aggregated[\"is_weekend_pct\"] = df_aggregated[ENTITY_COLUMN].map(entity_weekend_pct)\n",
  " tp_added.append(\"is_weekend_pct\")\n",
- " \n",
+ "\n",
  " if \"month_sin\" in features or \"month_cos\" in features:\n",
  " if \"month_sin\" not in df_aggregated.columns:\n",
  " entity_month = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.month.mean())\n",
  " df_aggregated[\"month_sin\"] = np.sin(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_month) / 12)\n",
  " df_aggregated[\"month_cos\"] = np.cos(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_month) / 12)\n",
  " tp_added.append(\"month (month_sin, month_cos)\")\n",
- " \n",
+ "\n",
  " if \"quarter_sin\" in features or \"quarter_cos\" in features:\n",
  " if \"quarter_sin\" not in df_aggregated.columns:\n",
  " entity_quarter = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.quarter.mean())\n",
  " df_aggregated[\"quarter_sin\"] = np.sin(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_quarter) / 4)\n",
  " df_aggregated[\"quarter_cos\"] = np.cos(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_quarter) / 4)\n",
  " tp_added.append(\"quarter (quarter_sin, quarter_cos)\")\n",
- " \n",
+ "\n",
  " if \"year_trend\" in features:\n",
  " if \"year_trend\" not in df_aggregated.columns:\n",
  " entity_year = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.year.mean())\n",
  " min_year = entity_year.min()\n",
  " df_aggregated[\"year_trend\"] = df_aggregated[ENTITY_COLUMN].map(entity_year) - min_year\n",
  " tp_added.append(f\"year_trend (normalized from {min_year:.0f})\")\n",
- " \n",
+ "\n",
  " if \"year_categorical\" in features:\n",
  " if \"year_mode\" not in df_aggregated.columns:\n",
  " entity_year_mode = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(\n",
@@ -816,9 +820,9 @@
  " )\n",
  " df_aggregated[\"year_mode\"] = df_aggregated[ENTITY_COLUMN].map(entity_year_mode).astype(int)\n",
  " tp_added.append(\"year_mode (categorical - encode before modeling)\")\n",
- " \n",
+ "\n",
  " if tp_added:\n",
- " print(f\"\\n Adding features from temporal pattern analysis:\")\n",
+ " print(\"\\n Adding features from temporal pattern analysis:\")\n",
  " for feat in tp_added:\n",
  " print(f\" -> {feat}\")\n",
  "\n",
@@ -827,20 +831,20 @@
  " trend_added = []\n",
  " for rec in TREND_RECOMMENDATIONS:\n",
  " features = rec.get(\"features\", [])\n",
- " \n",
+ "\n",
  " if \"recent_vs_overall_ratio\" in features:\n",
  " if \"recent_vs_overall_ratio\" not in df_aggregated.columns:\n",
  " time_span = (df[TIME_COLUMN].max() - df[TIME_COLUMN].min()).days\n",
  " recent_cutoff = df[TIME_COLUMN].max() - pd.Timedelta(days=int(time_span * 0.3))\n",
- " \n",
+ "\n",
  " overall_counts = df.groupby(ENTITY_COLUMN).size()\n",
  " recent_counts = df[df[TIME_COLUMN] >= recent_cutoff].groupby(ENTITY_COLUMN).size()\n",
- " \n",
+ "\n",
  " ratio = recent_counts / overall_counts\n",
  " ratio = ratio.fillna(0)\n",
  " df_aggregated[\"recent_vs_overall_ratio\"] = df_aggregated[ENTITY_COLUMN].map(ratio).fillna(0)\n",
  " trend_added.append(\"recent_vs_overall_ratio\")\n",
- " \n",
+ "\n",
  " if \"entity_trend_slope\" in features:\n",
  " if \"entity_trend_slope\" not in df_aggregated.columns:\n",
  " def compute_entity_slope(group):\n",
@@ -852,13 +856,13 @@
  " return 0.0\n",
  " slope = np.polyfit(x, y, 1)[0]\n",
  " return slope\n",
- " \n",
+ "\n",
  " entity_slopes = df.groupby(ENTITY_COLUMN).apply(compute_entity_slope)\n",
  " df_aggregated[\"entity_trend_slope\"] = df_aggregated[ENTITY_COLUMN].map(entity_slopes).fillna(0)\n",
  " trend_added.append(\"entity_trend_slope\")\n",
- " \n",
+ "\n",
  " if trend_added:\n",
- " print(f\"\\n Adding features from trend analysis:\")\n",
+ " print(\"\\n Adding features from trend analysis:\")\n",
  " for feat in trend_added:\n",
  " print(f\" -> {feat}\")\n",
  "\n",
@@ -868,25 +872,25 @@
  " if not skip_cohort:\n",
  " cohort_added = []\n",
  " cohort_features = [f for r in COHORT_RECOMMENDATIONS for f in r.get(\"features\", [])]\n",
- " \n",
+ "\n",
  " if \"cohort_year\" in cohort_features or \"cohort_quarter\" in cohort_features:\n",
  " entity_first = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].min()\n",
- " \n",
+ "\n",
  " if \"cohort_year\" in cohort_features and \"cohort_year\" not in df_aggregated.columns:\n",
  " df_aggregated[\"cohort_year\"] = df_aggregated[ENTITY_COLUMN].map(entity_first).dt.year\n",
  " cohort_added.append(\"cohort_year\")\n",
- " \n",
+ "\n",
  " if \"cohort_quarter\" in cohort_features and \"cohort_quarter\" not in df_aggregated.columns:\n",
  " first_dates = df_aggregated[ENTITY_COLUMN].map(entity_first)\n",
  " df_aggregated[\"cohort_quarter\"] = first_dates.dt.year.astype(str) + \"Q\" + first_dates.dt.quarter.astype(str)\n",
  " cohort_added.append(\"cohort_quarter\")\n",
- " \n",
+ "\n",
  " if cohort_added:\n",
- " print(f\"\\n Adding cohort features:\")\n",
+ " print(\"\\n Adding cohort features:\")\n",
  " for feat in cohort_added:\n",
  " print(f\" -> {feat}\")\n",
  " else:\n",
- " print(f\"\\n Skipping cohort features (insufficient variation)\")\n",
+ " print(\"\\n Skipping cohort features (insufficient variation)\")\n",
  "\n",
  "# Step 8: Add momentum ratio features from 01c momentum recommendations\n",
  "if MOMENTUM_RECOMMENDATIONS:\n",
@@ -894,24 +898,26 @@
  " df_aggregated = create_momentum_ratio_features(df_aggregated, MOMENTUM_RECOMMENDATIONS)\n",
  " new_momentum_cols = set(df_aggregated.columns) - before_cols\n",
  " if new_momentum_cols:\n",
- " print(f\"\\n Adding momentum ratio features:\")\n",
+ " print(\"\\n Adding momentum ratio features:\")\n",
  " for feat in sorted(new_momentum_cols):\n",
  " print(f\" -> {feat}\")\n",
  " else:\n",
- " print(f\"\\n Momentum ratio features: columns not available in aggregated data (skipped)\")\n",
+ " print(\"\\n Momentum ratio features: columns not available in aggregated data (skipped)\")\n",
  "\n",
  "# Step 9: Add recency bucket feature\n",
  "if INCLUDE_RECENCY and \"days_since_last_event\" in df_aggregated.columns:\n",
  " df_aggregated = create_recency_bucket_feature(df_aggregated)\n",
  " if \"recency_bucket\" in df_aggregated.columns:\n",
- " print(f\"\\n Adding recency_bucket feature:\")\n",
+ " print(\"\\n Adding recency_bucket feature:\")\n",
  " for bucket, count in df_aggregated[\"recency_bucket\"].value_counts().sort_index().items():\n",
  " pct = count / len(df_aggregated) * 100\n",
  " print(f\" {bucket}: {count:,} ({pct:.1f}%)\")\n",
  "\n",
- "print(f\"\\n Aggregation complete!\")\n",
+ "print(\"\\n Aggregation complete!\")\n",
  "print(f\" Output: {len(df_aggregated):,} entities x {len(df_aggregated.columns)} features\")\n",
- "print(f\" Memory: {df_aggregated.memory_usage(deep=True).sum() / 1024**2:.1f} MB\")"
+ "from customer_retention.core.compat import safe_memory_usage_bytes\n",
+ "\n",
+ "print(f\" Memory: {safe_memory_usage_bytes(df_aggregated) / 1024**2:.1f} MB\")"
  ]
  },
  {
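The memory line above now routes through safe_memory_usage_bytes from the new customer_retention/core/compat/__init__.py (+10 lines, file 25 in the list above). Its implementation is not included in this diff; a minimal sketch, assuming it merely guards pandas' deep memory_usage call against backends that do not support it (for example pandas-on-Spark frames in a Databricks environment), might look like:

import pandas as pd

def safe_memory_usage_bytes(df: pd.DataFrame) -> float:
    # Hypothetical sketch - the real helper lives in customer_retention.core.compat
    # and is not shown in this diff. Return the frame's in-memory size in bytes,
    # falling back to 0.0 when deep introspection raises on non-pandas frames.
    try:
        return float(df.memory_usage(deep=True).sum())
    except Exception:
        return 0.0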
@@ -1036,12 +1042,12 @@
  "if original_entities == aggregated_entities:\n",
  " print(f\"\\n✅ Entity count matches: {aggregated_entities:,}\")\n",
  "else:\n",
- " print(f\"\\n⚠️ Entity count mismatch!\")\n",
+ " print(\"\\n⚠️ Entity count mismatch!\")\n",
  " print(f\" Original: {original_entities:,}\")\n",
  " print(f\" Aggregated: {aggregated_entities:,}\")\n",
  "\n",
  "# Check feature statistics\n",
- "print(f\"\\n📊 Feature Statistics:\")\n",
+ "print(\"\\n📊 Feature Statistics:\")\n",
  "numeric_agg_cols = df_aggregated.select_dtypes(include=[np.number]).columns.tolist()\n",
  "if TARGET_COLUMN:\n",
  " numeric_agg_cols = [c for c in numeric_agg_cols if c != TARGET_COLUMN]\n",
@@ -1057,7 +1063,7 @@
  "\n",
  "# If lifecycle_quadrant was added, show its correlation with target\n",
  "if INCLUDE_LIFECYCLE_QUADRANT and TARGET_COLUMN and TARGET_COLUMN in df_aggregated.columns:\n",
- " print(f\"\\n📊 Lifecycle Quadrant vs Target:\")\n",
+ " print(\"\\n📊 Lifecycle Quadrant vs Target:\")\n",
  " cross = pd.crosstab(df_aggregated[\"lifecycle_quadrant\"], df_aggregated[TARGET_COLUMN], normalize='index')\n",
  " if 1 in cross.columns:\n",
  " for quad in cross.index:\n",
@@ -1200,6 +1206,7 @@
  "print(f\"✅ Original findings updated with aggregation metadata: {FINDINGS_PATH}\")\n",
  "\n",
  "from customer_retention.analysis.notebook_html_exporter import export_notebook_html\n",
+ "\n",
  "export_notebook_html(Path(\"01d_event_aggregation.ipynb\"), EXPERIMENTS_DIR / \"docs\")\n"
  ]
  },
@@ -1230,25 +1237,25 @@
  "print(\"AGGREGATION COMPLETE - OUTPUT SUMMARY\")\n",
  "print(\"=\"*70)\n",
  "\n",
- "print(f\"\\n📁 Files created:\")\n",
+ "print(\"\\n📁 Files created:\")\n",
  "print(f\" 1. Aggregated data: {AGGREGATED_DATA_PATH}\")\n",
  "print(f\" 2. Aggregated findings: {AGGREGATED_FINDINGS_PATH}\")\n",
  "print(f\" 3. Updated original findings: {FINDINGS_PATH}\")\n",
  "\n",
- "print(f\"\\n📊 Transformation stats:\")\n",
+ "print(\"\\n📊 Transformation stats:\")\n",
  "print(f\" Input events: {len(df):,}\")\n",
  "print(f\" Output entities: {len(df_aggregated):,}\")\n",
  "print(f\" Features created: {len(df_aggregated.columns)}\")\n",
  "\n",
- "print(f\"\\n⚙️ Configuration applied:\")\n",
+ "print(\"\\n⚙️ Configuration applied:\")\n",
  "print(f\" Windows: {WINDOWS} (from {window_source})\")\n",
  "print(f\" Aggregation functions: {AGG_FUNCTIONS}\")\n",
  "if priority_cols:\n",
  " print(f\" Priority columns (from 01c divergence): {priority_cols}\")\n",
  "if INCLUDE_LIFECYCLE_QUADRANT:\n",
- " print(f\" Lifecycle quadrant: included (from 01a recommendation)\")\n",
+ " print(\" Lifecycle quadrant: included (from 01a recommendation)\")\n",
  "\n",
- "print(f\"\\n🎯 Ready for modeling:\")\n",
+ "print(\"\\n🎯 Ready for modeling:\")\n",
  "print(f\" Entity column: {ENTITY_COLUMN}\")\n",
  "if TARGET_COLUMN:\n",
  " print(f\" Target column: {TARGET_COLUMN}\")\n",
@@ -1258,9 +1265,9 @@
  "\n",
  "# Drift warning if applicable\n",
  "if ts_meta.drift_risk_level == \"high\":\n",
- " print(f\"\\n⚠️ DRIFT WARNING: High drift risk detected in 01a\")\n",
+ " print(\"\\n⚠️ DRIFT WARNING: High drift risk detected in 01a\")\n",
  " print(f\" Volume drift: {ts_meta.volume_drift_risk or 'unknown'}\")\n",
- " print(f\" Consider: temporal validation splits, monitoring for distribution shift\")"
+ " print(\" Consider: temporal validation splits, monitoring for distribution shift\")"
  ]
  },
  {
@@ -1317,19 +1324,19 @@
  "\n",
  "if TARGET_COLUMN and TARGET_COLUMN in df_aggregated.columns:\n",
  " detector = LeakageDetector()\n",
- " \n",
+ "\n",
  " # Separate features and target\n",
  " feature_cols = [c for c in df_aggregated.columns if c not in [ENTITY_COLUMN, TARGET_COLUMN]]\n",
  " X = df_aggregated[feature_cols]\n",
  " y = df_aggregated[TARGET_COLUMN]\n",
- " \n",
+ "\n",
  " # Run leakage checks\n",
  " result = detector.run_all_checks(X, y, include_pit=False)\n",
- " \n",
+ "\n",
  " print(\"=\" * 70)\n",
  " print(\"LEAKAGE VALIDATION RESULTS\")\n",
  " print(\"=\" * 70)\n",
- " \n",
+ "\n",
  " if result.passed:\n",
  " print(\"\\n✅ PASSED: No critical leakage issues detected\")\n",
  " print(f\" Total checks run: {len(result.checks)}\")\n",
@@ -1460,4 +1467,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
- }
+ }