churnkit 0.76.1a1-py3-none-any.whl → 0.76.1a2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +10 -5
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +6 -6
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +52 -46
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +68 -65
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +12 -27
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +216 -221
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +88 -81
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +111 -108
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +44 -38
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +89 -85
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +81 -80
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +83 -89
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +102 -98
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +32 -31
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +33 -29
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +6 -5
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +67 -63
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +38 -23
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +3 -1
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/METADATA +1 -1
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/RECORD +30 -30
- customer_retention/__init__.py +1 -1
- customer_retention/analysis/auto_explorer/explorer.py +2 -2
- customer_retention/analysis/notebook_progress.py +4 -1
- customer_retention/core/compat/__init__.py +10 -0
- customer_retention/integrations/databricks_init.py +13 -0
- customer_retention/stages/profiling/column_profiler.py +9 -2
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/WHEEL +0 -0
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/entry_points.txt +0 -0
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/licenses/LICENSE +0 -0
```diff
@@ -82,21 +82,22 @@
   "outputs": [],
   "source": [
    "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+   "\n",
    "track_and_export_previous(\"04_relationship_analysis.ipynb\")\n",
    "\n",
-   "from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationRegistry\n",
-   "from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
-   "from customer_retention.core.config.column_config import ColumnType\n",
-   "from customer_retention.stages.profiling import (\n",
-   "    RelationshipRecommender, RecommendationCategory\n",
-   ")\n",
-   "import yaml\n",
-   "import pandas as pd\n",
    "import numpy as np\n",
+   "import pandas as pd\n",
    "import plotly.graph_objects as go\n",
-   "import
+   "import yaml\n",
    "from plotly.subplots import make_subplots\n",
-   "
+   "\n",
+   "from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationRegistry\n",
+   "from customer_retention.analysis.visualization import ChartBuilder, display_figure\n",
+   "from customer_retention.core.config.column_config import ColumnType\n",
+   "from customer_retention.core.config.experiments import (\n",
+   "    FINDINGS_DIR,\n",
+   ")\n",
+   "from customer_retention.stages.profiling import RecommendationCategory, RelationshipRecommender\n"
   ]
  },
  {
@@ -159,7 +160,7 @@
    "findings = ExplorationFindings.load(FINDINGS_PATH)\n",
    "\n",
    "# Load data - handle aggregated vs standard paths\n",
-   "from customer_retention.stages.temporal import
+   "from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n",
    "\n",
    "# For aggregated data, load directly from the parquet source\n",
    "if \"_aggregated\" in FINDINGS_PATH and findings.source_path.endswith('.parquet'):\n",
@@ -376,19 +377,19 @@
    "# Feature Distributions by Retention Status\n",
    "if findings.target_column and findings.target_column in df.columns:\n",
    "    target = findings.target_column\n",
-   "
+   "\n",
    "    feature_cols = [\n",
    "        name for name, col in findings.columns.items()\n",
    "        if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
    "        and name != target\n",
    "        and name not in TEMPORAL_METADATA_COLS\n",
    "    ]\n",
-   "
+   "\n",
    "    if feature_cols:\n",
    "        print(\"=\" * 80)\n",
    "        print(f\"FEATURE DISTRIBUTIONS BY TARGET: {target}\")\n",
    "        print(\"=\" * 80)\n",
-   "
+   "\n",
    "        # Calculate summary statistics by target\n",
    "        summary_by_target = []\n",
    "        for col in feature_cols:\n",
@@ -403,16 +404,16 @@
    "                \"Median\": subset.median(),\n",
    "                \"Std\": subset.std()\n",
    "            })\n",
-   "
+   "\n",
    "        if summary_by_target:\n",
    "            summary_df = pd.DataFrame(summary_by_target)\n",
-   "
+   "\n",
    "            # Display summary table\n",
    "            print(\"\\n📊 Summary Statistics by Retention Status:\")\n",
    "            display_summary = summary_df.pivot(index=\"Feature\", columns=\"Group\", values=[\"Mean\", \"Median\"])\n",
    "            display_summary.columns = [f\"{stat} ({group})\" for stat, group in display_summary.columns]\n",
    "            display(display_summary.round(3))\n",
-   "
+   "\n",
    "            # Calculate effect size (Cohen's d) for each feature\n",
    "            print(\"\\n📈 Feature Importance Indicators (Effect Size - Cohen's d):\")\n",
    "            print(\"-\" * 70)\n",
@@ -420,16 +421,16 @@
    "        for col in feature_cols:\n",
    "            churned = df[df[target] == 0][col].dropna()\n",
    "            retained = df[df[target] == 1][col].dropna()\n",
-   "
+   "\n",
    "            if len(churned) > 0 and len(retained) > 0:\n",
    "                # Cohen's d\n",
-   "                pooled_std = np.sqrt(((len(churned)-1)*churned.std()**2 + (len(retained)-1)*retained.std()**2)
+   "                pooled_std = np.sqrt(((len(churned)-1)*churned.std()**2 + (len(retained)-1)*retained.std()**2) /\n",
    "                                     (len(churned) + len(retained) - 2))\n",
    "                if pooled_std > 0:\n",
    "                    d = (retained.mean() - churned.mean()) / pooled_std\n",
    "                else:\n",
    "                    d = 0\n",
-   "
+   "\n",
    "                # Interpret effect size\n",
    "                abs_d = abs(d)\n",
    "                if abs_d >= 0.8:\n",
@@ -444,17 +445,17 @@
    "                else:\n",
    "                    interpretation = \"Negligible\"\n",
    "                    emoji = \"⚪\"\n",
-   "
+   "\n",
    "                effect_sizes.append({\n",
    "                    \"feature\": col,\n",
    "                    \"cohens_d\": d,\n",
    "                    \"abs_d\": abs_d,\n",
    "                    \"interpretation\": interpretation\n",
    "                })\n",
-   "
+   "\n",
    "                direction = \"↑ Higher in retained\" if d > 0 else \"↓ Lower in retained\"\n",
    "                print(f\"  {emoji} {col}: d={d:+.3f} ({interpretation}) {direction}\")\n",
-   "
+   "\n",
    "        # Sort by effect size for identifying important features\n",
    "        if effect_sizes:\n",
    "            effect_df = pd.DataFrame(effect_sizes).sort_values(\"abs_d\", ascending=False)\n",
```
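An aside on the hunk above: the line the new version un-truncates is the standard pooled-standard-deviation form of Cohen's d. A minimal, self-contained sketch of the same computation, with hypothetical `retained`/`churned` series standing in for the notebook's groups:

```python
import numpy as np
import pandas as pd

def cohens_d(retained: pd.Series, churned: pd.Series) -> float:
    """Effect size between two groups, using the pooled standard deviation."""
    n_r, n_c = len(retained), len(churned)
    # Pooled std: sample-size-weighted combination of the two variances (ddof=1)
    pooled_std = np.sqrt(((n_c - 1) * churned.std() ** 2 +
                          (n_r - 1) * retained.std() ** 2) / (n_c + n_r - 2))
    if pooled_std == 0:
        return 0.0
    return (retained.mean() - churned.mean()) / pooled_std

# Conventional reading, matching the notebook's thresholds:
# |d| >= 0.8 large, >= 0.5 medium, >= 0.2 small, else negligible
d = cohens_d(pd.Series([5.0, 6.0, 7.0]), pd.Series([3.0, 4.0, 5.0]))
print(f"d={d:+.3f}")  # d=+2.000
```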
```diff
@@ -535,27 +536,27 @@
    "# Box Plots: Visual comparison of distributions\n",
    "if findings.target_column and findings.target_column in df.columns:\n",
    "    target = findings.target_column\n",
-   "
+   "\n",
    "    feature_cols = [\n",
    "        name for name, col in findings.columns.items()\n",
    "        if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
    "        and name != target\n",
    "        and name not in TEMPORAL_METADATA_COLS\n",
    "    ]\n",
-   "
+   "\n",
    "    if feature_cols:\n",
    "        # Create box plots - one subplot per feature for better control\n",
    "        n_features = min(len(feature_cols), 6)\n",
-   "
+   "\n",
    "        fig = make_subplots(\n",
    "            rows=1, cols=n_features,\n",
    "            subplot_titles=feature_cols[:n_features],\n",
    "            horizontal_spacing=0.05\n",
    "        )\n",
-   "
+   "\n",
    "        for i, col in enumerate(feature_cols[:n_features]):\n",
    "            col_num = i + 1\n",
-   "
+   "\n",
    "            # Retained (1) - Green\n",
    "            retained_data = df[df[target] == 1][col].dropna()\n",
    "            fig.add_trace(\n",
@@ -577,7 +578,7 @@
    "                ),\n",
    "                row=1, col=col_num\n",
    "            )\n",
-   "
+   "\n",
    "            # Churned (0) - Red\n",
    "            churned_data = df[df[target] == 0][col].dropna()\n",
    "            fig.add_trace(\n",
@@ -599,7 +600,7 @@
    "                ),\n",
    "                row=1, col=col_num\n",
    "            )\n",
-   "
+   "\n",
    "        fig.update_layout(\n",
    "            height=450,\n",
    "            title_text=\"Feature Distributions: Retained (Green) vs Churned (Red)\",\n",
@@ -610,12 +611,12 @@
    "            boxgap=0.3,\n",
    "            boxgroupgap=0.1\n",
    "        )\n",
-   "
+   "\n",
    "        # Center the boxes by removing x-axis tick labels (title is above each subplot)\n",
    "        fig.update_xaxes(showticklabels=False)\n",
-   "
+   "\n",
    "        display_figure(fig)\n",
-   "
+   "\n",
    "        # Print mean comparison\n",
    "        print(\"\\n📊 MEAN COMPARISON BY RETENTION STATUS:\")\n",
    "        print(\"-\" * 70)\n",
@@ -682,15 +683,15 @@
    "        and name != target\n",
    "        and name not in TEMPORAL_METADATA_COLS\n",
    "    ]\n",
-   "
+   "\n",
    "    if feature_cols:\n",
    "        correlations = []\n",
    "        for col in feature_cols:\n",
    "            corr = df[[col, target]].corr().iloc[0, 1]\n",
    "            correlations.append({\"Feature\": col, \"Correlation\": corr})\n",
-   "
+   "\n",
    "        corr_df = pd.DataFrame(correlations).sort_values(\"Correlation\", key=abs, ascending=False)\n",
-   "
+   "\n",
    "        fig = charts.bar_chart(\n",
    "            corr_df[\"Feature\"].tolist(),\n",
    "            corr_df[\"Correlation\"].tolist(),\n",
```
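On the correlation hunk above: `df[[col, target]].corr().iloc[0, 1]` is plain Pearson correlation, which against a 0/1 target is mathematically the point-biserial correlation. A minimal sketch with hypothetical data (the column names are illustrative, not from the package):

```python
import pandas as pd
from scipy import stats

df = pd.DataFrame({
    "tenure_months": [1, 3, 24, 36, 2, 48],  # hypothetical numeric feature
    "retained":      [0, 0, 1,  1,  0, 1],   # hypothetical 0/1 target
})

# Pearson correlation against a binary target == point-biserial correlation
pearson = df[["tenure_months", "retained"]].corr().iloc[0, 1]
pb, _p = stats.pointbiserialr(df["retained"], df["tenure_months"])
assert abs(pearson - pb) < 1e-12  # the two agree up to floating-point noise
print(f"r = {pearson:+.3f}")
```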
```diff
@@ -757,23 +758,23 @@
    "if findings.target_column:\n",
    "    target = findings.target_column\n",
    "    overall_retention = df[target].mean()\n",
-   "
+   "\n",
    "    categorical_cols = [\n",
    "        name for name, col in findings.columns.items()\n",
    "        if col.inferred_type in [ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL]\n",
    "        and name not in TEMPORAL_METADATA_COLS\n",
    "    ]\n",
-   "
+   "\n",
    "    print(\"=\" * 80)\n",
    "    print(\"CATEGORICAL FEATURE ANALYSIS\")\n",
    "    print(\"=\" * 80)\n",
    "    print(f\"Overall retention rate: {overall_retention:.1%}\")\n",
-   "
+   "\n",
    "    if categorical_cols:\n",
    "        # Use framework analyzer for summary\n",
    "        cat_analyzer = CategoricalTargetAnalyzer(min_samples_per_category=10)\n",
    "        summary_df = cat_analyzer.analyze_multiple(df, categorical_cols, target)\n",
-   "
+   "\n",
    "        print(\"\\n📈 Categorical Feature Strength (Cramér's V):\")\n",
    "        print(\"-\" * 60)\n",
    "        for _, row in summary_df.iterrows():\n",
@@ -788,15 +789,15 @@
    "                emoji = \"🟢\"\n",
    "            sig = \"***\" if row[\"p_value\"] < 0.001 else \"**\" if row[\"p_value\"] < 0.01 else \"*\" if row[\"p_value\"] < 0.05 else \"\"\n",
    "            print(f\"  {emoji} {row['feature']}: V={row['cramers_v']:.3f} ({strength}) {sig}\")\n",
-   "
+   "\n",
    "        # Detailed analysis for each categorical feature\n",
    "        for col_name in categorical_cols[:5]:\n",
    "            result = cat_analyzer.analyze(df, col_name, target)\n",
-   "
+   "\n",
    "            print(f\"\\n{'='*60}\")\n",
    "            print(f\"📊 {col_name.upper()}\")\n",
    "            print(\"=\"*60)\n",
-   "
+   "\n",
    "            # Display stats table\n",
    "            if len(result.category_stats) > 0:\n",
    "                display_stats = result.category_stats[['category', 'total_count', 'retention_rate', 'lift', 'pct_of_total']].copy()\n",
```
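The Cramér's V values and significance stars above come from the package's `CategoricalTargetAnalyzer`, whose internals this diff does not show; the statistic itself is standard. A minimal sketch of the uncorrected form, assuming a hypothetical `plan` column:

```python
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

def cramers_v(x: pd.Series, y: pd.Series) -> tuple[float, float]:
    """Uncorrected Cramér's V and the chi-square p-value for two categoricals."""
    table = pd.crosstab(x, y)
    chi2, p, _dof, _expected = chi2_contingency(table)
    n = table.to_numpy().sum()
    min_dim = min(table.shape) - 1  # min(rows, cols) - 1
    return np.sqrt(chi2 / (n * min_dim)), p

df = pd.DataFrame({
    "plan":     ["basic", "basic", "pro", "pro", "basic", "pro"],  # hypothetical
    "retained": [0, 0, 1, 1, 0, 1],
})
v, p = cramers_v(df["plan"], df["retained"])
print(f"V={v:.3f}, p={p:.3f}")
```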
```diff
@@ -805,15 +806,15 @@
    "                display_stats['pct_of_total'] = display_stats['pct_of_total'].apply(lambda x: f\"{x:.1%}\")\n",
    "                display_stats.columns = [col_name, 'Count', 'Retention Rate', 'Lift', '% of Data']\n",
    "                display(display_stats)\n",
-   "
+   "\n",
    "            # Stacked bar chart\n",
    "            cat_stats = result.category_stats\n",
    "            categories = cat_stats['category'].tolist()\n",
    "            retained_counts = cat_stats['retained_count'].tolist()\n",
    "            churned_counts = cat_stats['churned_count'].tolist()\n",
-   "
+   "\n",
    "            fig = go.Figure()\n",
-   "
+   "\n",
    "            fig.add_trace(go.Bar(\n",
    "                name='Retained',\n",
    "                x=categories,\n",
@@ -823,7 +824,7 @@
    "                textposition='inside',\n",
    "                textfont=dict(color='white', size=12)\n",
    "            ))\n",
-   "
+   "\n",
    "            fig.add_trace(go.Bar(\n",
    "                name='Churned',\n",
    "                x=categories,\n",
@@ -833,7 +834,7 @@
    "                textposition='inside',\n",
    "                textfont=dict(color='white', size=12)\n",
    "            ))\n",
-   "
+   "\n",
    "            fig.update_layout(\n",
    "                barmode='stack',\n",
    "                title=f\"Retention by {col_name}\",\n",
@@ -844,10 +845,10 @@
    "                legend=dict(orientation=\"h\", yanchor=\"bottom\", y=1.02, xanchor=\"center\", x=0.5)\n",
    "            )\n",
    "            display_figure(fig)\n",
-   "
+   "\n",
    "            # Flag high-risk categories from framework result\n",
    "            if result.high_risk_categories:\n",
-   "                print(
+   "                print(\"\\n  ⚠️ High-risk categories (lift < 0.9x):\")\n",
    "                for cat in result.high_risk_categories:\n",
    "                    cat_row = cat_stats[cat_stats['category'] == cat].iloc[0]\n",
    "                    print(f\"    • {cat}: {cat_row['retention_rate']:.1%} retention ({cat_row['lift']:.2f}x lift)\")\n",
```
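The "lift < 0.9x" cutoff above flags categories whose retention rate sits well below the overall rate; as the printed output implies, lift here is segment retention divided by overall retention. A minimal sketch, assuming a hypothetical `channel` column:

```python
import pandas as pd

df = pd.DataFrame({
    "channel":  ["web", "web", "store", "store", "partner", "partner"],  # hypothetical
    "retained": [1, 1, 1, 0, 0, 0],
})

overall = df["retained"].mean()
by_cat = df.groupby("channel")["retained"].agg(retention_rate="mean", count="size")
by_cat["lift"] = by_cat["retention_rate"] / overall

# Flag categories retaining noticeably worse than average (the diff's 0.9x cutoff)
high_risk = by_cat[by_cat["lift"] < 0.9]
print(high_risk)
```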
```diff
@@ -1033,33 +1034,33 @@
    "if datetime_cols and findings.target_column:\n",
    "    target = findings.target_column\n",
    "    overall_retention = df[target].mean()\n",
-   "
+   "\n",
    "    # Use framework analyzer\n",
    "    temporal_analyzer = TemporalTargetAnalyzer(min_samples_per_period=10)\n",
-   "
+   "\n",
    "    for col_name in datetime_cols[:3]:\n",
    "        result = temporal_analyzer.analyze(df, col_name, target)\n",
-   "
+   "\n",
    "        print(f\"\\n{'='*60}\")\n",
    "        print(f\"📅 {col_name.upper()}\")\n",
    "        print(\"=\"*60)\n",
-   "
+   "\n",
    "        if result.n_valid_dates == 0:\n",
    "            print(\"  No valid dates found\")\n",
    "            continue\n",
-   "
+   "\n",
    "        print(f\"  Date range: {result.min_date} to {result.max_date}\")\n",
    "        print(f\"  Valid dates: {result.n_valid_dates:,}\")\n",
-   "
+   "\n",
    "        # 1. Retention by Year (from framework result)\n",
    "        if len(result.yearly_stats) > 1:\n",
    "            print(f\"\\n  📊 Retention by Year: Trend is {result.yearly_trend}\")\n",
-   "
+   "\n",
    "            year_stats = result.yearly_stats\n",
-   "
+   "\n",
    "            fig = make_subplots(rows=1, cols=2, subplot_titles=[\"Retention Rate by Year\", \"Customer Count by Year\"],\n",
    "                                column_widths=[0.6, 0.4])\n",
-   "
+   "\n",
    "            fig.add_trace(\n",
    "                go.Scatter(\n",
    "                    x=year_stats['period'].astype(str),\n",
@@ -1073,7 +1074,7 @@
    "            )\n",
    "            fig.add_hline(y=overall_retention, line_dash=\"dash\", line_color=\"gray\",\n",
    "                          annotation_text=f\"Overall: {overall_retention:.1%}\", row=1, col=1)\n",
-   "
+   "\n",
    "            fig.add_trace(\n",
    "                go.Bar(\n",
    "                    x=year_stats['period'].astype(str),\n",
@@ -1083,19 +1084,19 @@
    "                ),\n",
    "                row=1, col=2\n",
    "            )\n",
-   "
+   "\n",
    "            fig.update_layout(height=350, template='plotly_white', showlegend=False)\n",
    "            fig.update_yaxes(tickformat='.0%', row=1, col=1)\n",
    "            display_figure(fig)\n",
-   "
+   "\n",
    "        # 2. Retention by Month (from framework result)\n",
    "        if len(result.monthly_stats) > 1:\n",
-   "            print(
-   "
+   "            print(\"\\n  📊 Retention by Month (Seasonality):\")\n",
+   "\n",
    "            month_stats = result.monthly_stats\n",
-   "            colors = ['rgba(46, 204, 113, 0.7)' if r >= overall_retention else 'rgba(231, 76, 60, 0.7)'
+   "            colors = ['rgba(46, 204, 113, 0.7)' if r >= overall_retention else 'rgba(231, 76, 60, 0.7)'\n",
    "                      for r in month_stats['retention_rate']]\n",
-   "
+   "\n",
    "            fig = go.Figure()\n",
    "            fig.add_trace(go.Bar(\n",
    "                x=month_stats['month_name'],\n",
@@ -1106,7 +1107,7 @@
    "            ))\n",
    "            fig.add_hline(y=overall_retention, line_dash=\"dash\", line_color=\"gray\",\n",
    "                          annotation_text=f\"Overall: {overall_retention:.1%}\")\n",
-   "
+   "\n",
    "            fig.update_layout(\n",
    "                title=f\"Monthly Retention Pattern ({col_name})\",\n",
    "                xaxis_title=\"Month\",\n",
@@ -1116,21 +1117,21 @@
    "                yaxis_tickformat='.0%'\n",
    "            )\n",
    "            display_figure(fig)\n",
-   "
+   "\n",
    "            # Seasonal insights from framework\n",
    "            if result.seasonal_spread > 0.05:\n",
    "                print(f\"  📈 Seasonal spread: {result.seasonal_spread:.1%}\")\n",
    "                print(f\"     Best month: {result.best_month}\")\n",
    "                print(f\"     Worst month: {result.worst_month}\")\n",
-   "
+   "\n",
    "        # 3. Retention by Day of Week (from framework result)\n",
    "        if len(result.dow_stats) > 1:\n",
-   "            print(
-   "
+   "            print(\"\\n  📊 Retention by Day of Week:\")\n",
+   "\n",
    "            dow_stats = result.dow_stats\n",
-   "            colors = ['rgba(46, 204, 113, 0.7)' if r >= overall_retention else 'rgba(231, 76, 60, 0.7)'
+   "            colors = ['rgba(46, 204, 113, 0.7)' if r >= overall_retention else 'rgba(231, 76, 60, 0.7)'\n",
    "                      for r in dow_stats['retention_rate']]\n",
-   "
+   "\n",
    "            fig = go.Figure()\n",
    "            fig.add_trace(go.Bar(\n",
    "                x=dow_stats['day_name'],\n",
@@ -1140,7 +1141,7 @@
    "                textposition='outside'\n",
    "            ))\n",
    "            fig.add_hline(y=overall_retention, line_dash=\"dash\", line_color=\"gray\")\n",
-   "
+   "\n",
    "            fig.update_layout(\n",
    "                title=f\"Day of Week Pattern ({col_name})\",\n",
    "                xaxis_title=\"Day of Week\",\n",
```
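The yearly, monthly, and day-of-week stats above come from the package's `TemporalTargetAnalyzer`, which this diff does not show; the underlying aggregation is ordinary datetime grouping. A minimal sketch with hypothetical dates and column names:

```python
import pandas as pd

df = pd.DataFrame({
    "signup_date": pd.to_datetime(
        ["2023-01-15", "2023-06-01", "2024-02-20", "2024-07-04"]),  # hypothetical
    "retained": [1, 0, 1, 1],
})

dates = df["signup_date"]
# Retention rate per calendar period; comparing each against the overall mean
# surfaces trend (by year), seasonality (by month), and weekday effects.
yearly  = df.groupby(dates.dt.year)["retained"].mean()
monthly = df.groupby(dates.dt.month_name())["retained"].mean()
dow     = df.groupby(dates.dt.day_name())["retained"].mean()
print(yearly, monthly, dow, sep="\n\n")
```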
```diff
@@ -1371,7 +1372,7 @@
    "    strong_df[\"correlation\"] = strong_df[\"correlation\"].apply(lambda x: f\"{x:+.3f}\")\n",
    "    strong_df = strong_df.sort_values(\"effect_size\", key=lambda x: x.str.replace(\"+\", \"\").astype(float).abs(), ascending=False)\n",
    "    display(strong_df)\n",
-   "
+   "\n",
    "    print(\"\\n  💡 These features show strong discrimination between retained/churned customers.\")\n",
    "    print(\"     → Ensure they're included in your model\")\n",
    "    print(\"     → Check for data quality issues that could inflate their importance\")\n",
@@ -1478,7 +1479,7 @@
    "    risk_df[\"retention_rate\"] = risk_df[\"retention_rate\"].apply(lambda x: f\"{x:.1%}\")\n",
    "    risk_df[\"lift\"] = risk_df[\"lift\"].apply(lambda x: f\"{x:.2f}x\")\n",
    "    display(risk_df[[\"feature\", \"segment\", \"count\", \"retention_rate\", \"lift\"]])\n",
-   "
+   "\n",
    "    print(\"\\n  💡 These segments have below-average retention.\")\n",
    "    print(\"     → Ensure they're adequately represented in both train and test sets\")\n",
    "    print(\"     → Consider oversampling or class weights in modeling\")\n",
@@ -1674,7 +1675,7 @@
    "    print(\"POTENTIAL INTERACTION FEATURES:\")\n",
    "    strong_features = [p[\"feature\"] for p in analysis_summary.strong_predictors[:5]]\n",
    "    if len(strong_features) >= 2:\n",
-   "        print(
+   "        print(\"\\n  Based on strong predictors, consider interactions between:\")\n",
    "        for i, f1 in enumerate(strong_features[:3]):\n",
    "            for f2 in strong_features[i+1:4]:\n",
    "                print(f\"    • {f1} × {f2}\")\n",
@@ -1733,12 +1734,12 @@
    "\n",
    "if all_recs_data:\n",
    "    recs_df = pd.DataFrame(all_recs_data)\n",
-   "
+   "\n",
    "    # Sort by priority\n",
    "    priority_order = {\"HIGH\": 0, \"MEDIUM\": 1, \"LOW\": 2}\n",
    "    recs_df[\"_sort\"] = recs_df[\"Priority\"].map(priority_order)\n",
    "    recs_df = recs_df.sort_values(\"_sort\").drop(\"_sort\", axis=1)\n",
-   "
+   "\n",
    "    print(\"=\" * 80)\n",
    "    print(\"ALL RECOMMENDATIONS SUMMARY\")\n",
    "    print(\"=\" * 80)\n",
@@ -1746,7 +1747,7 @@
    "    print(f\"  🔴 High priority: {len(recs_df[recs_df['Priority'] == 'HIGH'])}\")\n",
    "    print(f\"  🟡 Medium priority: {len(recs_df[recs_df['Priority'] == 'MEDIUM'])}\")\n",
    "    print(f\"  🟢 Low priority: {len(recs_df[recs_df['Priority'] == 'LOW'])}\")\n",
-   "
+   "\n",
    "    display(recs_df)\n",
    "\n",
    "# Save updated findings and recommendations registry\n",
```
|