churnkit-0.76.0a3-py3-none-any.whl → churnkit-0.76.1a2-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +10 -5
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +11 -9
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +52 -46
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +68 -65
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +12 -27
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +216 -221
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +88 -81
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +111 -108
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +44 -38
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +89 -85
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +81 -80
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +83 -89
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +102 -98
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +32 -31
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +33 -29
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +6 -5
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +67 -63
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +38 -23
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +3 -1
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/METADATA +1 -1
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/RECORD +31 -31
- customer_retention/__init__.py +1 -1
- customer_retention/analysis/auto_explorer/explorer.py +2 -2
- customer_retention/analysis/notebook_progress.py +14 -2
- customer_retention/core/compat/__init__.py +10 -0
- customer_retention/core/config/experiments.py +45 -0
- customer_retention/integrations/databricks_init.py +41 -1
- customer_retention/stages/profiling/column_profiler.py +9 -2
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/WHEEL +0 -0
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/entry_points.txt +0 -0
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/licenses/LICENSE +0 -0
@@ -82,20 +82,23 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"06_feature_opportunities.ipynb\")\n",
 "\n",
-"from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationEngine, RecommendationRegistry\n",
-"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
-"from customer_retention.core.config.column_config import ColumnType\n",
-"from customer_retention.stages.features import CustomerSegmenter, SegmentationType\n",
-"from customer_retention.stages.profiling import FeatureCapacityAnalyzer\n",
-"import yaml\n",
-"import pandas as pd\n",
 "import numpy as np\n",
+"import pandas as pd\n",
 "import plotly.graph_objects as go\n",
-"import
-"
-"from customer_retention.
+"import yaml\n",
+"\n",
+"from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationEngine, RecommendationRegistry\n",
+"from customer_retention.analysis.visualization import ChartBuilder, display_figure\n",
+"from customer_retention.core.config.column_config import ColumnType\n",
+"from customer_retention.core.config.experiments import (\n",
+"    EXPERIMENTS_DIR,\n",
+"    FINDINGS_DIR,\n",
+")\n",
+"from customer_retention.stages.features import CustomerSegmenter\n",
+"from customer_retention.stages.profiling import FeatureCapacityAnalyzer\n"
 ]
 },
 {
@@ -158,7 +161,7 @@
 "findings = ExplorationFindings.load(FINDINGS_PATH)\n",
 "\n",
 "# Load data - handle aggregated vs standard paths\n",
-"from customer_retention.stages.temporal import
+"from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n",
 "\n",
 "# For aggregated data, load directly from the parquet source\n",
 "if \"_aggregated\" in FINDINGS_PATH and findings.source_path.endswith('.parquet'):\n",
@@ -338,53 +341,53 @@
 "    if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
 "    and name != findings.target_column\n",
 "]\n",
-"
+"\n",
 "capacity_result = capacity_analyzer.analyze(\n",
 "    df,\n",
 "    feature_cols=numeric_features,\n",
 "    target_col=findings.target_column,\n",
 ")\n",
-"
-"print(
+"\n",
+"print(\"\\n📊 DATA SUMMARY:\")\n",
 "print(f\"  Total samples: {capacity_result.total_samples:,}\")\n",
 "print(f\"  Minority class samples: {capacity_result.minority_class_samples:,}\")\n",
 "print(f\"  Minority class rate: {capacity_result.minority_class_samples/capacity_result.total_samples:.1%}\")\n",
 "print(f\"  Current numeric features: {capacity_result.total_features}\")\n",
-"
-"print(
+"\n",
+"print(\"\\n📈 FEATURE CAPACITY METRICS:\")\n",
 "print(f\"  Events Per Variable (EPV): {capacity_result.events_per_variable:.1f}\")\n",
 "print(f\"  Samples Per Feature: {capacity_result.samples_per_feature:.1f}\")\n",
 "print(f\"  Capacity Status: {capacity_result.capacity_status.upper()}\")\n",
-"
+"\n",
 "# Capacity status visualization\n",
 "status_colors = {\"adequate\": \"#2ecc71\", \"limited\": \"#f39c12\", \"inadequate\": \"#e74c3c\"}\n",
 "status_color = status_colors.get(capacity_result.capacity_status, \"#95a5a6\")\n",
-"
-"print(
+"\n",
+"print(\"\\n🎯 RECOMMENDED FEATURE COUNTS:\")\n",
 "print(f\"  Conservative (EPV=20): {capacity_result.recommended_features_conservative} features\")\n",
 "print(f\"  Moderate (EPV=10): {capacity_result.recommended_features_moderate} features\")\n",
 "print(f\"  Aggressive (EPV=5): {capacity_result.recommended_features_aggressive} features\")\n",
-"
+"\n",
 "# Effective features analysis\n",
 "if capacity_result.effective_features_result:\n",
 "    eff = capacity_result.effective_features_result\n",
-"    print(
+"    print(\"\\n🔍 EFFECTIVE FEATURES (accounting for correlation):\")\n",
 "    print(f\"  Total features analyzed: {eff.total_count}\")\n",
 "    print(f\"  Effective independent features: {eff.effective_count:.1f}\")\n",
 "    print(f\"  Redundant features identified: {len(eff.redundant_features)}\")\n",
-"
+"\n",
 "    if eff.redundant_features:\n",
-"        print(
+"        print(\"\\n  ⚠️ Redundant features (highly correlated):\")\n",
 "        for feat in eff.redundant_features[:5]:\n",
 "            print(f\"    • {feat}\")\n",
-"
+"\n",
 "    if eff.feature_clusters:\n",
 "        print(f\"\\n  📦 Correlated feature clusters ({len(eff.feature_clusters)}):\")\n",
 "        for i, cluster in enumerate(eff.feature_clusters[:3]):\n",
 "            print(f\"    Cluster {i+1}: {', '.join(cluster[:4])}\")\n",
 "            if len(cluster) > 4:\n",
 "                print(f\"      ... and {len(cluster)-4} more\")\n",
-"
+"\n",
 "# Persist feature capacity to registry\n",
 "registry.add_bronze_feature_capacity(\n",
 "    epv=capacity_result.events_per_variable,\n",
@@ -394,8 +397,8 @@
 "    rationale=f\"EPV={capacity_result.events_per_variable:.1f}, status={capacity_result.capacity_status}\",\n",
 "    source_notebook=\"06_feature_opportunities\"\n",
 ")\n",
-"print(
-"
+"print(\"\\n✅ Persisted feature capacity recommendation to registry\")\n",
+"\n",
 "# Store capacity info in findings\n",
 "findings.metadata[\"feature_capacity\"] = capacity_result.to_dict()\n",
 "else:\n",
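The cell changed above leans on the events-per-variable (EPV) heuristic: EPV = minority-class events ÷ number of candidate features, and the conservative/moderate/aggressive budgets it prints invert that ratio at EPV targets of 20, 10, and 5. Below is a minimal sketch of that arithmetic, assuming a binary target; it is an illustration of the heuristic, not the FeatureCapacityAnalyzer internals.

import pandas as pd

def epv_budgets(df: pd.DataFrame, feature_cols: list, target_col: str) -> dict:
    # The minority-class event count drives everything (hypothetical recreation).
    minority = int(df[target_col].value_counts().min())
    return {
        "events_per_variable": minority / len(feature_cols),
        "conservative_max_features": minority // 20,  # EPV=20
        "moderate_max_features": minority // 10,      # EPV=10
        "aggressive_max_features": minority // 5,     # EPV=5
    }

With 500 minority-class events and 30 numeric features, EPV ≈ 16.7 and the moderate budget is 50 features, so there is headroom under the moderate target even though the conservative budget (25) is nearly exhausted.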
@@ -446,20 +449,20 @@
 "# Model Complexity Guidance\n",
 "if findings.target_column and 'capacity_result' in dir():\n",
 "    guidance = capacity_result.complexity_guidance\n",
-"
+"\n",
 "    print(\"=\" * 70)\n",
 "    print(\"MODEL COMPLEXITY GUIDANCE\")\n",
 "    print(\"=\" * 70)\n",
-"
+"\n",
 "    # Create visualization of feature limits by model type\n",
 "    model_types = [\"Linear\\n(no regularization)\", \"Regularized\\n(L1/L2)\", \"Tree-based\\n(RF/XGBoost)\"]\n",
 "    max_features = [guidance.max_features_linear, guidance.max_features_regularized, guidance.max_features_tree]\n",
 "    current_features = capacity_result.total_features\n",
-"
+"\n",
 "    colors = ['#e74c3c' if m < current_features else '#2ecc71' for m in max_features]\n",
-"
+"\n",
 "    fig = go.Figure()\n",
-"
+"\n",
 "    fig.add_trace(go.Bar(\n",
 "        x=model_types,\n",
 "        y=max_features,\n",
@@ -468,7 +471,7 @@
 "        textposition='outside',\n",
 "        name='Max Features'\n",
 "    ))\n",
-"
+"\n",
 "    # Add horizontal line for current feature count\n",
 "    fig.add_hline(\n",
 "        y=current_features,\n",
@@ -477,7 +480,7 @@
 "        annotation_text=f\"Current: {current_features}\",\n",
 "        annotation_position=\"right\"\n",
 "    )\n",
-"
+"\n",
 "    # Calculate y-axis range to fit labels\n",
 "    max_val = max(max_features)\n",
 "    fig.update_layout(\n",
@@ -489,19 +492,19 @@
 "        height=400,\n",
 "        showlegend=False,\n",
 "    )\n",
-"
+"\n",
 "    display_figure(fig)\n",
-"
+"\n",
 "    print(f\"\\n🎯 RECOMMENDED MODEL TYPE: {guidance.recommended_model_type.replace('_', ' ').title()}\")\n",
-"
+"\n",
 "    print(\"\\n📋 MODEL-SPECIFIC RECOMMENDATIONS:\")\n",
 "    for rec in guidance.model_recommendations:\n",
 "        print(f\"  • {rec}\")\n",
-"
+"\n",
 "    print(\"\\n💡 GENERAL GUIDANCE:\")\n",
 "    for rec in guidance.recommendations:\n",
 "        print(f\"  {rec}\")\n",
-"
+"\n",
 "    # Summary table\n",
 "    print(\"\\n\" + \"-\" * 70)\n",
 "    print(\"FEATURE BUDGET SUMMARY:\")\n",
@@ -512,12 +515,12 @@
 "        \"Current\": [current_features] * 3,\n",
 "        \"Status\": [\n",
 "            \"✅ OK\" if guidance.max_features_linear >= current_features else \"⚠️ Reduce\",\n",
-"            \"✅ OK\" if guidance.max_features_regularized >= current_features else \"⚠️ Reduce\"
+"            \"✅ OK\" if guidance.max_features_regularized >= current_features else \"⚠️ Reduce\",\n",
 "            \"✅ OK\" if guidance.max_features_tree >= current_features else \"⚠️ Reduce\"\n",
 "        ]\n",
 "    }\n",
 "    display(pd.DataFrame(summary_data))\n",
-"
+"\n",
 "    # Persist model type recommendation to registry\n",
 "    registry.add_bronze_model_type(\n",
 "        model_type=guidance.recommended_model_type,\n",
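The FEATURE BUDGET SUMMARY table in this hunk reduces to one comparison per model family: flag "Reduce" whenever the family's maximum feature budget falls below the current feature count. A tiny sketch of that rule with hypothetical budgets (in the notebook the real limits come from complexity_guidance):

current_features = 24  # hypothetical count
budgets = {"linear": 12, "regularized": 30, "tree": 60}  # hypothetical guidance values
status = {name: "OK" if limit >= current_features else "Reduce"
          for name, limit in budgets.items()}
print(status)  # {'linear': 'Reduce', 'regularized': 'OK', 'tree': 'OK'}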
@@ -593,20 +596,20 @@
 "if findings.target_column and categorical_cols and 'numeric_features' in dir():\n",
 "    # Analyze the first categorical column as potential segment\n",
 "    segment_col = categorical_cols[0]\n",
-"
+"\n",
 "    print(f\"\\n📊 Analyzing segments by: {segment_col}\")\n",
 "    print(f\"  Features to evaluate: {len(numeric_features)}\")\n",
-"
+"\n",
 "    segment_result = capacity_analyzer.analyze_segment_capacity(\n",
 "        df,\n",
 "        feature_cols=numeric_features,\n",
 "        target_col=findings.target_column,\n",
 "        segment_col=segment_col,\n",
 "    )\n",
-"
+"\n",
 "    print(f\"\\n🎯 RECOMMENDED STRATEGY: {segment_result.recommended_strategy.replace('_', ' ').title()}\")\n",
 "    print(f\"  Reason: {segment_result.strategy_reason}\")\n",
-"
+"\n",
 "    # Segment details table\n",
 "    segment_data = []\n",
 "    for seg_name, cap in segment_result.segment_capacities.items():\n",
@@ -618,14 +621,14 @@
 "            \"Max Features (EPV=10)\": cap.recommended_features_moderate,\n",
 "            \"Status\": cap.capacity_status.title()\n",
 "        })\n",
-"
+"\n",
 "    segment_df = pd.DataFrame(segment_data)\n",
 "    segment_df = segment_df.sort_values(\"Samples\", ascending=False)\n",
 "    display(segment_df)\n",
-"
+"\n",
 "    # Visualization\n",
 "    fig = go.Figure()\n",
-"
+"\n",
 "    max_events = 0\n",
 "    for seg_name, cap in segment_result.segment_capacities.items():\n",
 "        color = \"#2ecc71\" if cap.capacity_status == \"adequate\" else \"#f39c12\" if cap.capacity_status == \"limited\" else \"#e74c3c\"\n",
@@ -638,7 +641,7 @@
 "            textposition='outside'\n",
 "        ))\n",
 "        max_events = max(max_events, cap.minority_class_samples)\n",
-"
+"\n",
 "    # Add threshold line\n",
 "    threshold_events = len(numeric_features) * 10  # EPV=10 threshold\n",
 "    fig.add_hline(\n",
@@ -648,7 +651,7 @@
 "        annotation_text=f\"Min events for {len(numeric_features)} features (EPV=10)\",\n",
 "        annotation_position=\"right\"\n",
 "    )\n",
-"
+"\n",
 "    # Calculate y-axis range to fit labels\n",
 "    y_max = max(max_events, threshold_events)\n",
 "    fig.update_layout(\n",
@@ -661,16 +664,16 @@
 "        showlegend=False,\n",
 "    )\n",
 "    display_figure(fig)\n",
-"
+"\n",
 "    print(\"\\n📋 SEGMENT RECOMMENDATIONS:\")\n",
 "    for rec in segment_result.recommendations:\n",
 "        print(f\"  {rec}\")\n",
-"
+"\n",
 "    if segment_result.viable_segments:\n",
 "        print(f\"\\n  ✅ Viable for separate models: {', '.join(segment_result.viable_segments)}\")\n",
 "    if segment_result.insufficient_segments:\n",
 "        print(f\"  ⚠️ Insufficient data: {', '.join(segment_result.insufficient_segments)}\")\n",
-"
+"\n",
 "    # Store in findings\n",
 "    findings.metadata[\"segment_capacity\"] = segment_result.to_dict()\n",
 "else:\n",
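Segment viability in these hunks follows from the same EPV rule applied per segment: the threshold line sits at len(numeric_features) * 10 minority-class events, so a segment can support its own model only if its minority-class count clears that bar. A sketch with made-up counts:

n_features = 15
threshold_events = n_features * 10  # the EPV=10 threshold drawn in the chart above

# Hypothetical minority-class counts per segment value.
segment_events = {"enterprise": 2100, "smb": 430, "trial": 90}

viable = [s for s, n in segment_events.items() if n >= threshold_events]
insufficient = [s for s, n in segment_events.items() if n < threshold_events]
print(viable)        # ['enterprise', 'smb']
print(insufficient)  # ['trial']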
@@ -724,38 +727,38 @@
 "    print(\"=\" * 70)\n",
 "    print(\"FEATURE CAPACITY ACTION ITEMS\")\n",
 "    print(\"=\" * 70)\n",
-"
+"\n",
 "    print(\"\\n📋 BASED ON YOUR DATA CAPACITY:\")\n",
-"
+"\n",
 "    # Action items based on capacity status\n",
 "    if capacity_result.capacity_status == \"adequate\":\n",
 "        print(\"\\n✅ ADEQUATE CAPACITY - You have room to add features\")\n",
 "        print(f\"  • Current features: {capacity_result.total_features}\")\n",
 "        print(f\"  • Can add up to: {capacity_result.recommended_features_moderate - capacity_result.total_features} more features (EPV=10)\")\n",
-"        print(
+"        print(\"  • Consider: Creating derived features from datetime and categorical columns\")\n",
 "    elif capacity_result.capacity_status == \"limited\":\n",
 "        print(\"\\n⚠️ LIMITED CAPACITY - Be selective with new features\")\n",
 "        print(f\"  • Current features: {capacity_result.total_features}\")\n",
 "        print(f\"  • Recommended max: {capacity_result.recommended_features_moderate} features (EPV=10)\")\n",
 "        print(f\"  • Action: Remove {max(0, capacity_result.total_features - capacity_result.recommended_features_moderate)} redundant features before adding new ones\")\n",
-"        print(
+"        print(\"  • Consider: Using regularization (L1/Lasso) if keeping all features\")\n",
 "    else:\n",
 "        print(\"\\n🔴 INADEQUATE CAPACITY - Reduce features or get more data\")\n",
 "        print(f\"  • Current features: {capacity_result.total_features}\")\n",
 "        print(f\"  • Recommended max: {capacity_result.recommended_features_moderate} features (EPV=10)\")\n",
 "        print(f\"  • CRITICAL: Reduce to {capacity_result.recommended_features_conservative} features for stable estimates\")\n",
-"        print(
-"
+"        print(\"  • Options: (1) Feature selection, (2) PCA, (3) Collect more data\")\n",
+"\n",
 "    # Redundancy recommendations\n",
 "    if capacity_result.effective_features_result and capacity_result.effective_features_result.redundant_features:\n",
 "        redundant = capacity_result.effective_features_result.redundant_features\n",
-"        print(
-"        print(
+"        print(\"\\n🔄 REDUNDANT FEATURES TO CONSIDER REMOVING:\")\n",
+"        print(\"  These features are highly correlated with others and add little new information:\")\n",
 "        for feat in redundant[:5]:\n",
 "            print(f\"    • {feat}\")\n",
 "        if len(redundant) > 5:\n",
 "            print(f\"      ... and {len(redundant) - 5} more\")\n",
-"
+"\n",
 "    # New feature budget\n",
 "    print(\"\\n💰 FEATURE BUDGET FOR NEW FEATURES:\")\n",
 "    remaining_budget = capacity_result.recommended_features_moderate - capacity_result.total_features\n",
@@ -767,7 +770,7 @@
 "        print(\"  • Engagement composites (email_engagement_score)\")\n",
 "    else:\n",
 "        print(f\"  ⚠️ At or over capacity. Remove {-remaining_budget} features before adding new ones.\")\n",
-"
+"\n",
 "    # Model selection summary\n",
 "    print(\"\\n🎯 RECOMMENDED MODELING APPROACH:\")\n",
 "    if capacity_result.complexity_guidance:\n",
@@ -778,7 +781,7 @@
 "        elif \"tree\" in capacity_result.complexity_guidance.recommended_model_type:\n",
 "            print(\"  → Random Forest or XGBoost recommended\")\n",
 "            print(\"  → Trees handle correlated features naturally\")\n",
-"
+"\n",
 "    print(\"\\n\" + \"=\" * 70)"
 ]
 },
@@ -835,19 +838,19 @@
 "    selector = FeatureSelector(target_column=findings.target_column)\n",
 "    availability_recs = selector.get_availability_recommendations(findings.feature_availability)\n",
 "    unavailable_features = [rec.column for rec in availability_recs]\n",
-"
+"\n",
 "    print(f\"\\n⚠️ {len(availability_recs)} feature(s) have tracking changes:\\n\")\n",
-"
+"\n",
 "    for rec in availability_recs:\n",
 "        print(f\"📌 {rec.column}\")\n",
 "        print(f\"  Issue: {rec.issue_type} | Coverage: {rec.coverage_pct:.0f}%\")\n",
 "        print(f\"  Available: {rec.first_valid_date} → {rec.last_valid_date}\")\n",
-"        print(
+"        print(\"\\n  Remediation options:\")\n",
 "        for opt in rec.options:\n",
 "            marker = \"→\" if opt.get(\"recommended\") else \" \"\n",
 "            print(f\"    {marker} [{opt['type']}] {opt['description']}\")\n",
 "        print()\n",
-"
+"\n",
 "    print(\"-\" * 70)\n",
 "    print(\"RECOMMENDED ACTION: Remove unavailable features before modeling\")\n",
 "    print(\"-\" * 70)\n",
@@ -856,7 +859,7 @@
 "    print(\"  • segment_by_cohort: Train separate models for different time periods\")\n",
 "    print(\"  • add_indicator: Create availability flags, impute missing values\")\n",
 "    print(\"  • filter_window: Restrict training data to feature's available period\")\n",
-"
+"\n",
 "    findings.metadata[\"unavailable_features\"] = unavailable_features\n",
 "    findings.metadata[\"availability_action\"] = \"exclude\"\n",
 "else:\n",
@@ -978,13 +981,13 @@
 "segmenter = CustomerSegmenter()\n",
 "df_features = df.copy()\n",
 "\n",
-"datetime_cols = [name for name, col in findings.columns.items()
+"datetime_cols = [name for name, col in findings.columns.items()\n",
 "                 if col.inferred_type == ColumnType.DATETIME\n",
 "                 and name not in TEMPORAL_METADATA_COLS]\n",
-"binary_cols = [name for name, col in findings.columns.items()
+"binary_cols = [name for name, col in findings.columns.items()\n",
 "               if col.inferred_type == ColumnType.BINARY\n",
 "               and name not in TEMPORAL_METADATA_COLS]\n",
-"numeric_cols = [name for name, col in findings.columns.items()
+"numeric_cols = [name for name, col in findings.columns.items()\n",
 "                if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]]\n",
 "\n",
 "for col in datetime_cols:\n",
@@ -1014,7 +1017,7 @@
 "activity_cols = [c for c in datetime_cols if 'last' in c.lower() or 'recent' in c.lower()]\n",
 "if activity_cols:\n",
 "    activity_col = activity_cols[0]\n",
-"    df_features = segmenter.create_recency_features(df_features, last_activity_column=activity_col
+"    df_features = segmenter.create_recency_features(df_features, last_activity_column=activity_col,\n",
 "        reference_date=reference_date, output_column='days_since_last_activity')\n",
 "    print(f\"  ✓ days_since_last_activity from {activity_col}\")\n",
 "    registry.add_silver_derived(\n",
@@ -1032,7 +1035,7 @@
 "\n",
 "if open_rate_cols and click_rate_cols:\n",
 "    open_col, click_col = open_rate_cols[0], click_rate_cols[0]\n",
-"    df_features = segmenter.create_engagement_score(df_features, open_rate_column=open_col
+"    df_features = segmenter.create_engagement_score(df_features, open_rate_column=open_col,\n",
 "        click_rate_column=click_col, output_column='email_engagement_score')\n",
 "    print(f\"  ✓ email_engagement_score from {open_col}, {click_col}\")\n",
 "    registry.add_silver_derived(\n",
@@ -1042,9 +1045,9 @@
 "        rationale=f\"Weighted engagement score from {open_col} and {click_col}\",\n",
 "        source_notebook=\"06_feature_opportunities\"\n",
 "    )\n",
-"
+"\n",
 "    df_features['click_to_open_rate'] = np.where(df_features[open_col] > 0, df_features[click_col] / df_features[open_col], 0)\n",
-"    print(
+"    print(\"  ✓ click_to_open_rate\")\n",
 "    registry.add_silver_ratio(\n",
 "        column=\"click_to_open_rate\",\n",
 "        numerator=click_col,\n",
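The new click_to_open_rate line above uses np.where so that customers with zero opens get 0 rather than a division-by-zero inf/NaN. The same pattern in isolation, with hypothetical column values:

import numpy as np
import pandas as pd

df = pd.DataFrame({"open_rate": [0.4, 0.0, 0.25],
                   "click_rate": [0.1, 0.0, 0.05]})

# Guarded ratio: rows with zero opens fall back to 0 instead of inf/NaN.
df["click_to_open_rate"] = np.where(
    df["open_rate"] > 0, df["click_rate"] / df["open_rate"], 0
)
print(df["click_to_open_rate"].tolist())  # [0.25, 0.0, 0.2]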
@@ -1171,18 +1174,18 @@
 "if 'customer_segment' in df_features.columns and findings.target_column and findings.target_column in df_features.columns:\n",
 "    target = findings.target_column\n",
 "    segment_retention = df_features.groupby('customer_segment')[target].mean() * 100\n",
-"
+"\n",
 "    max_rate = segment_retention.max()\n",
 "    fig = go.Figure(go.Bar(\n",
 "        x=segment_retention.index, y=segment_retention.values,\n",
 "        marker_color=['#2ca02c' if r > 70 else '#ffbb00' if r > 50 else '#d62728' for r in segment_retention.values],\n",
 "        text=[f'{r:.1f}%' for r in segment_retention.values], textposition='outside'))\n",
 "    fig.update_layout(\n",
-"        title='Retention Rate by Customer Segment'
-"        xaxis_title='Segment'
+"        title='Retention Rate by Customer Segment',\n",
+"        xaxis_title='Segment',\n",
 "        yaxis_title='Retention Rate (%)',\n",
 "        yaxis_range=[0, max_rate * 1.15],  # Add 15% headroom for labels\n",
-"        template='plotly_white'
+"        template='plotly_white',\n",
 "        height=400,\n",
 "    )\n",
 "    display_figure(fig)\n",
@@ -1240,17 +1243,17 @@
 "if numeric_cols:\n",
 "    print(\"Numeric Transformation Opportunities:\")\n",
 "    print(\"=\"*50)\n",
-"
+"\n",
 "    for col_name in numeric_cols:\n",
 "        col_info = findings.columns[col_name]\n",
 "        series = df[col_name].dropna()\n",
 "        skewness = series.skew()\n",
-"
+"\n",
 "        print(f\"\\n{col_name}:\")\n",
 "        print(f\"  Skewness: {skewness:.2f}\")\n",
-"
+"\n",
 "        if abs(skewness) > 1:\n",
-"            print(
+"            print(\"  Recommendation: Apply log transform (highly skewed)\")\n",
 "            registry.add_gold_transformation(\n",
 "                column=col_name,\n",
 "                transform=\"log\",\n",
@@ -1260,7 +1263,7 @@
 "            )\n",
 "            transform_count += 1\n",
 "        elif abs(skewness) > 0.5:\n",
-"            print(
+"            print(\"  Recommendation: Consider sqrt transform (moderately skewed)\")\n",
 "            registry.add_gold_transformation(\n",
 "                column=col_name,\n",
 "                transform=\"sqrt\",\n",
@@ -1270,7 +1273,7 @@
 "            )\n",
 "            transform_count += 1\n",
 "        else:\n",
-"            print(
+"            print(\"  Recommendation: Standard scaling sufficient\")\n",
 "            registry.add_gold_scaling(\n",
 "                column=col_name,\n",
 "                method=\"standard\",\n",
@@ -1278,10 +1281,10 @@
 "                source_notebook=\"06_feature_opportunities\"\n",
 "            )\n",
 "            transform_count += 1\n",
-"
+"\n",
 "        if col_info.inferred_type == ColumnType.NUMERIC_CONTINUOUS:\n",
 "            print(f\"  Binning: Consider creating bins for {col_name}_binned\")\n",
-"
+"\n",
 "    print(f\"\\n✅ Persisted {transform_count} transformation recommendations to registry\")"
 ]
 },
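The transformation cell in these hunks encodes a simple decision rule on sample skewness: |skew| > 1 → log transform, |skew| > 0.5 → sqrt, otherwise standard scaling alone. Stated as a standalone function (a sketch of the rule, not the registry plumbing):

import pandas as pd

def recommend_transform(series: pd.Series) -> str:
    # Thresholds mirror the notebook cell.
    skew = series.dropna().skew()
    if abs(skew) > 1:
        return "log"       # highly skewed
    if abs(skew) > 0.5:
        return "sqrt"      # moderately skewed
    return "standard"      # scaling alone is sufficient

print(recommend_transform(pd.Series([1, 1, 2, 2, 3, 50])))  # "log" (long right tail)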
@@ -1334,15 +1337,15 @@
 "if categorical_cols:\n",
 "    print(\"Categorical Encoding Recommendations:\")\n",
 "    print(\"=\"*50)\n",
-"
+"\n",
 "    for col_name in categorical_cols:\n",
 "        col_info = findings.columns[col_name]\n",
 "        distinct = col_info.universal_metrics.get(\"distinct_count\", 0)\n",
-"
+"\n",
 "        print(f\"\\n{col_name}: ({distinct} unique values)\")\n",
-"
+"\n",
 "        if distinct <= 5:\n",
-"            print(
+"            print(\"  Recommendation: One-hot encoding\")\n",
 "            registry.add_gold_encoding(\n",
 "                column=col_name,\n",
 "                method=\"onehot\",\n",
@@ -1351,7 +1354,7 @@
 "            )\n",
 "            encoding_count += 1\n",
 "        elif distinct <= 20:\n",
-"            print(
+"            print(\"  Recommendation: Target encoding or one-hot with frequency threshold\")\n",
 "            registry.add_gold_encoding(\n",
 "                column=col_name,\n",
 "                method=\"target\",\n",
@@ -1360,7 +1363,7 @@
 "            )\n",
 "            encoding_count += 1\n",
 "        else:\n",
-"            print(
+"            print(\"  Recommendation: Target encoding or embedding (high cardinality)\")\n",
 "            registry.add_gold_encoding(\n",
 "                column=col_name,\n",
 "                method=\"target\",\n",
@@ -1368,10 +1371,10 @@
 "                source_notebook=\"06_feature_opportunities\"\n",
 "            )\n",
 "            encoding_count += 1\n",
-"
+"\n",
 "        if col_info.inferred_type == ColumnType.CATEGORICAL_ORDINAL:\n",
-"            print(
-"
+"            print(\"  Note: Consider ordinal encoding to preserve order\")\n",
+"\n",
 "    print(f\"\\n✅ Persisted {encoding_count} encoding recommendations to registry\")"
 ]
 },
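Encoding choices in this cell are driven purely by cardinality: up to 5 distinct values → one-hot, up to 20 → target encoding, above that → target encoding or embeddings. The rule as a standalone sketch:

def recommend_encoding(distinct_count: int) -> str:
    # Cardinality thresholds mirror the notebook cell.
    if distinct_count <= 5:
        return "onehot"
    if distinct_count <= 20:
        return "target"  # or one-hot with a frequency threshold
    return "target"      # or embeddings for high cardinality

print([recommend_encoding(n) for n in (3, 12, 200)])
# ['onehot', 'target', 'target']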
@@ -1578,12 +1581,13 @@
 "registry.save(RECOMMENDATIONS_PATH)\n",
 "\n",
 "print(f\"✅ Saved {len(registry.all_recommendations)} recommendations to {RECOMMENDATIONS_PATH}\")\n",
-"print(
+"print(\"\\nRecommendations by layer:\")\n",
 "for layer in [\"bronze\", \"silver\", \"gold\"]:\n",
 "    recs = registry.get_by_layer(layer)\n",
 "    print(f\"  {layer.upper()}: {len(recs)}\")\n",
 "\n",
 "from customer_retention.analysis.notebook_html_exporter import export_notebook_html\n",
+"\n",
 "export_notebook_html(Path(\"06_feature_opportunities.ipynb\"), EXPERIMENTS_DIR / \"docs\")\n"
 ]
 }