churnkit 0.76.1a1__py3-none-any.whl → 0.76.1a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +10 -5
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +6 -6
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +52 -46
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +68 -65
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +12 -27
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +216 -221
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +88 -81
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +111 -108
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +44 -38
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +89 -85
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +81 -80
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +83 -89
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +102 -98
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +32 -31
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +33 -29
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +6 -5
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +67 -63
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +38 -23
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +3 -1
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/METADATA +1 -1
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/RECORD +30 -30
- customer_retention/__init__.py +1 -1
- customer_retention/analysis/auto_explorer/explorer.py +2 -2
- customer_retention/analysis/notebook_progress.py +4 -1
- customer_retention/core/compat/__init__.py +10 -0
- customer_retention/integrations/databricks_init.py +13 -0
- customer_retention/stages/profiling/column_profiler.py +9 -2
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/WHEEL +0 -0
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/entry_points.txt +0 -0
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/licenses/LICENSE +0 -0
@@ -79,25 +79,24 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"01c_temporal_patterns.ipynb\")\n",
 "\n",
-"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
-"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
-"from customer_retention.core.config.column_config import ColumnType, DatasetGranularity\n",
-"from customer_retention.stages.profiling import (\n",
-"    TemporalPatternAnalyzer, TemporalPatternAnalysis,\n",
-"    TrendResult, TrendDirection, SeasonalityResult, RecencyResult,\n",
-"    TemporalFeatureAnalyzer, VelocityResult, MomentumResult,\n",
-"    LagCorrelationResult, PredictivePowerResult, FeatureRecommendation,\n",
-"    CategoricalTargetAnalyzer\n",
-")\n",
-"import pandas as pd\n",
 "import numpy as np\n",
+"import pandas as pd\n",
 "import plotly.graph_objects as go\n",
-"import plotly.express as px\n",
 "from plotly.subplots import make_subplots\n",
-"
-"from customer_retention.
+"\n",
+"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
+"from customer_retention.analysis.visualization import ChartBuilder, display_figure\n",
+"from customer_retention.core.config.experiments import (\n",
+"    FINDINGS_DIR,\n",
+")\n",
+"from customer_retention.stages.profiling import (\n",
+"    TemporalFeatureAnalyzer,\n",
+"    TemporalPatternAnalyzer,\n",
+"    TrendDirection,\n",
+")\n"
 ]
 },
 {
@@ -123,7 +122,6 @@
 "outputs": [],
 "source": [
 "# === CONFIGURATION ===\n",
-"from pathlib import Path\n",
 "\n",
 "# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
 "\n",
@@ -195,7 +193,7 @@
 },
 "outputs": [],
 "source": [
-"from customer_retention.stages.temporal import
+"from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n",
 "\n",
 "# Load source data (prefers snapshots over raw files)\n",
 "df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
@@ -264,9 +262,7 @@
 "TARGET_AGGREGATION = \"max\"  # Options: \"max\", \"mean\", \"sum\", \"last\", \"first\"\n",
 "\n",
 "# Detect and analyze target\n",
-"from customer_retention.stages.profiling import
-"    TargetLevelAnalyzer, TargetColumnDetector, AggregationMethod\n",
-")\n",
+"from customer_retention.stages.profiling import AggregationMethod, TargetColumnDetector, TargetLevelAnalyzer\n",
 "\n",
 "detector = TargetColumnDetector()\n",
 "target_col, method = detector.detect(findings, df, override=TARGET_COLUMN_OVERRIDE)\n",
@@ -278,14 +274,14 @@
 "    agg_method = AggregationMethod(TARGET_AGGREGATION)\n",
 "    df, result = analyzer.aggregate_to_entity(df, TARGET_COLUMN, ENTITY_COLUMN, TIME_COLUMN, agg_method)\n",
 "    analyzer.print_analysis(result)\n",
-"
+"\n",
 "    # Update TARGET_COLUMN to entity-level version if aggregated\n",
 "    if result.entity_target_column:\n",
 "        ORIGINAL_TARGET = TARGET_COLUMN\n",
 "        TARGET_COLUMN = result.entity_target_column\n",
 "\n",
 "print(\"\\n\" + \"─\"*70)\n",
-"print(
+"print(\"Final configuration:\")\n",
 "print(f\"  ENTITY_COLUMN: {ENTITY_COLUMN}\")\n",
 "print(f\"  TIME_COLUMN: {TIME_COLUMN}\")\n",
 "print(f\"  TARGET_COLUMN: {TARGET_COLUMN}\")\n",
@@ -363,10 +359,10 @@
 "print(\"=\"*70)\n",
 "print(f\"\\nSource: {'Manual override' if WINDOW_OVERRIDE else '01a findings (recommended)'}\")\n",
 "print(f\"\\nWindows: {pattern_config.aggregation_windows}\")\n",
-"print(
+"print(\"\\nDerived settings used throughout this notebook:\")\n",
 "print(f\"  • Velocity/Rolling window: {pattern_config.velocity_window_days} days\")\n",
 "print(f\"  • Momentum pairs: {pattern_config.get_momentum_pairs()}\")\n",
-"print(
+"print(\"\\n💡 To override, set WINDOW_OVERRIDE = ['7d', '30d', '90d'] above and re-run\")\n"
 ]
 },
 {
@@ -415,7 +411,7 @@
 "numeric_cols = [c for c in numeric_cols if c not in [ENTITY_COLUMN] and c not in TEMPORAL_METADATA_COLS]\n",
 "\n",
 "# Separate target columns from feature columns\n",
-"target_cols = [c for c in numeric_cols if c.lower() in ['target', 'target_entity', 'label']
+"target_cols = [c for c in numeric_cols if c.lower() in ['target', 'target_entity', 'label']\n",
 "               or (TARGET_COLUMN and c.lower() == TARGET_COLUMN.lower())]\n",
 "feature_cols = [c for c in numeric_cols if c not in target_cols]\n",
 "\n",
@@ -610,7 +606,7 @@
 "    if rec.features:\n",
 "        print(f\"      Features: {', '.join(rec.features)}\")\n",
 "\n",
-"TREND_RECOMMENDATIONS = [{\"action\": r.action, \"priority\": r.priority, \"reason\": r.reason
+"TREND_RECOMMENDATIONS = [{\"action\": r.action, \"priority\": r.priority, \"reason\": r.reason,\n",
 "                          \"features\": r.features} for r in trend_recs]"
 ]
 },
@@ -686,7 +682,6 @@
 "outputs": [],
 "source": [
 "# Seasonality Analysis - Temporal Pattern Grid + Autocorrelation\n",
-"from plotly.subplots import make_subplots\n",
 "\n",
 "# Prepare temporal columns\n",
 "daily_data[\"day_of_week\"] = daily_data[TIME_COLUMN].dt.day_name()\n",
@@ -748,17 +743,17 @@
 "\n",
 "# Variation analysis\n",
 "def calc_var(stats): return (stats[\"mean\"].max() - stats[\"mean\"].min()) / overall_mean * 100 if len(stats) > 1 else 0\n",
-"variations = {\"day_of_week\": calc_var(dow_stats), \"month\": calc_var(monthly_stats)
+"variations = {\"day_of_week\": calc_var(dow_stats), \"month\": calc_var(monthly_stats),\n",
 "              \"quarter\": calc_var(quarterly_stats), \"year\": calc_var(yearly_stats)}\n",
 "\n",
-"print(
+"print(\"\\n📈 Pattern Variation (% from mean):\")\n",
 "print(f\"  Day of Week: {variations['day_of_week']:.1f}%\")\n",
 "print(f\"  Monthly: {variations['month']:.1f}%\")\n",
 "print(f\"  Quarterly: {variations['quarter']:.1f}%\")\n",
 "print(f\"  Yearly: {variations['year']:.1f}%\")\n",
 "\n",
 "# Autocorrelation seasonality\n",
-"print(
+"print(\"\\n🔁 Autocorrelation Seasonality (threshold > 0.3):\")\n",
 "if seasonality_results:\n",
 "    for sr in seasonality_results:\n",
 "        strength = \"Strong\" if sr.strength > 0.5 else \"Moderate\"\n",
@@ -771,7 +766,7 @@
 "SEASONALITY_RECOMMENDATIONS = []\n",
 "for pattern, var_pct in variations.items():\n",
 "    priority = \"high\" if var_pct > 20 else \"medium\" if var_pct > 10 else \"low\"\n",
-"
+"\n",
 "    if pattern == \"day_of_week\" and var_pct > 10:\n",
 "        SEASONALITY_RECOMMENDATIONS.append({\"pattern\": pattern, \"variation\": var_pct, \"priority\": priority,\n",
 "            \"features\": [\"dow_sin\", \"dow_cos\", \"is_weekend\"], \"reason\": f\"{var_pct:.1f}% variation - add cyclical encoding\"})\n",
@@ -794,8 +789,8 @@
 "# For autocorrelation-detected patterns\n",
 "for sr in seasonality_results:\n",
 "    if sr.period in [7, 14, 21, 30] and sr.strength > 0.3:\n",
-"        SEASONALITY_RECOMMENDATIONS.append({\"pattern\": f\"{sr.period}d_cycle\", \"variation\": sr.strength * 100
-"            \"priority\": \"medium\", \"features\": [f\"lag_{sr.period}d_ratio\"]
+"        SEASONALITY_RECOMMENDATIONS.append({\"pattern\": f\"{sr.period}d_cycle\", \"variation\": sr.strength * 100,\n",
+"            \"priority\": \"medium\", \"features\": [f\"lag_{sr.period}d_ratio\"],\n",
 "            \"reason\": f\"Autocorrelation {sr.strength:.2f} at {sr.period}d - add lag ratio feature\"})\n",
 "\n",
 "print(\"\\n\" + \"─\"*60)\n",
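Aside on the seasonality hunk above: when day-of-week variation exceeds 10%, the notebook recommends dow_sin, dow_cos and is_weekend features with "add cyclical encoding" as the reason. A minimal sketch of that cyclical encoding, using a hypothetical DataFrame and column name rather than anything taken from churnkit:

import numpy as np
import pandas as pd

# Hypothetical event table; only the timestamp column matters here.
events = pd.DataFrame({"event_time": pd.date_range("2024-01-01", periods=14, freq="D")})
dow = events["event_time"].dt.dayofweek              # 0 = Monday ... 6 = Sunday
events["dow_sin"] = np.sin(2 * np.pi * dow / 7)      # sine/cosine pair keeps Sunday
events["dow_cos"] = np.cos(2 * np.pi * dow / 7)      # adjacent to Monday on the circle
events["is_weekend"] = dow.isin([5, 6]).astype(int)  # simple binary weekend flag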
@@ -881,22 +876,22 @@
 "    first_events = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].min().reset_index()\n",
 "    first_events.columns = [ENTITY_COLUMN, \"first_event\"]\n",
 "    cohort_dist = analyze_cohort_distribution(first_events, \"first_event\")\n",
-"
+"\n",
 "    cohort_result = analyzer.analyze_cohorts(\n",
 "        df, entity_column=ENTITY_COLUMN, cohort_column=TIME_COLUMN,\n",
 "        target_column=TARGET_COLUMN, period=\"M\"\n",
 "    )\n",
-"
+"\n",
 "    print(\"👥 COHORT ANALYSIS\")\n",
 "    print(\"=\"*50)\n",
 "    print(f\"\\nEntity Onboarding: {cohort_dist.dominant_pct:.0f}% in {cohort_dist.dominant_year}, {cohort_dist.num_years} years total\")\n",
-"
+"\n",
 "    if len(cohort_result) > 0:\n",
 "        cohort_sorted = cohort_result.sort_values(\"cohort\")\n",
 "        has_retention = \"retention_rate\" in cohort_sorted.columns\n",
-"
+"\n",
 "        fig = make_subplots(specs=[[{\"secondary_y\": True}]]) if has_retention else go.Figure()\n",
-"
+"\n",
 "        fig.add_trace(go.Bar(\n",
 "            x=cohort_sorted[\"cohort\"].astype(str), y=cohort_sorted[\"entity_count\"],\n",
 "            name=\"Entities (sign-up cohort)\", marker_color=\"steelblue\", opacity=0.7\n",
@@ -904,7 +899,7 @@
 "            x=cohort_sorted[\"cohort\"].astype(str), y=cohort_sorted[\"entity_count\"],\n",
 "            name=\"Entities (sign-up cohort)\", marker_color=\"steelblue\", opacity=0.7\n",
 "        ))\n",
-"
+"\n",
 "        if has_retention:\n",
 "            fig.add_trace(go.Scatter(\n",
 "                x=cohort_sorted[\"cohort\"].astype(str), y=cohort_sorted[\"retention_rate\"] * 100,\n",
@@ -912,7 +907,7 @@
 "                line=dict(color=\"coral\", width=3), marker=dict(size=8)\n",
 "            ), secondary_y=True)\n",
 "            fig.update_yaxes(title_text=\"Retention Rate %\", secondary_y=True)\n",
-"
+"\n",
 "        fig.update_layout(\n",
 "            title=\"Cohort Analysis: Entity Count by Sign-up Month (cohort = first event period)\",\n",
 "            xaxis_title=\"Cohort (First Event Month)\", template=\"plotly_white\", height=400\n",
@@ -948,21 +943,21 @@
 "    retention_var = None\n",
 "    if \"retention_rate\" in cohort_result.columns:\n",
 "        retention_var = cohort_result[\"retention_rate\"].max() - cohort_result[\"retention_rate\"].min()\n",
-"
+"\n",
 "    cohort_recs = generate_cohort_recommendations(cohort_dist, retention_variation=retention_var)\n",
-"
+"\n",
 "    print(\"📊 COHORT DETAILS\")\n",
 "    print(\"=\"*50)\n",
-"    print(
+"    print(\"\\nEntity Onboarding Distribution by Year:\")\n",
 "    print(\"─\" * 40)\n",
 "    for year, count in sorted(cohort_dist.year_counts.items()):\n",
 "        pct = count / cohort_dist.total_entities * 100\n",
 "        bar = \"█\" * int(pct / 3)\n",
 "        print(f\"  {year}: {count:>5,} entities ({pct:>5.1f}%) {bar}\")\n",
-"
+"\n",
 "    print(f\"\\n  Total entities: {cohort_dist.total_entities:,}\")\n",
 "    print(f\"  Data spans: {df[TIME_COLUMN].min().date()} to {df[TIME_COLUMN].max().date()}\")\n",
-"
+"\n",
 "    print(\"\\n📋 RECOMMENDATIONS:\")\n",
 "    for rec in cohort_recs:\n",
 "        priority_icon = {\"high\": \"🔴\", \"medium\": \"🟡\", \"low\": \"🟢\"}.get(rec.priority, \"⚪\")\n",
@@ -972,9 +967,9 @@
 "            print(f\"     Features: {', '.join(rec.features)}\")\n",
 "        if rec.insight:\n",
 "            print(f\"     💡 {rec.insight}\")\n",
-"
+"\n",
 "    COHORT_RECOMMENDATIONS = [{\"action\": r.action, \"priority\": r.priority, \"reason\": r.reason,\n",
-"                               \"features\": getattr(r, 'features', [])
+"                               \"features\": getattr(r, 'features', []),\n",
 "                               \"insight\": getattr(r, 'insight', None)} for r in cohort_recs]"
 ]
 },
@@ -1031,7 +1026,7 @@
 "source": [
 "# Correlation matrix for numeric event attributes\n",
 "# Define analysis columns - exclude entity, time, target, and temporal metadata\n",
-"numeric_event_cols = [c for c in df.select_dtypes(include=[np.number]).columns
+"numeric_event_cols = [c for c in df.select_dtypes(include=[np.number]).columns\n",
 "                      if c not in [ENTITY_COLUMN, TIME_COLUMN, TARGET_COLUMN]\n",
 "                      and c not in TEMPORAL_METADATA_COLS\n",
 "                      and 'target' not in c.lower()]\n",
@@ -1049,7 +1044,7 @@
 "        title=\"Feature Correlation Matrix (Event-Level)\"\n",
 "    )\n",
 "    display_figure(fig)\n",
-"
+"\n",
 "    # High correlation pairs\n",
 "    high_corr = []\n",
 "    for i in range(len(numeric_event_cols)):\n",
@@ -1057,7 +1052,7 @@
 "            corr_val = corr_matrix.iloc[i, j]\n",
 "            if abs(corr_val) > 0.7:\n",
 "                high_corr.append((numeric_event_cols[i], numeric_event_cols[j], corr_val))\n",
-"
+"\n",
 "    if high_corr:\n",
 "        print(\"\\n⚠️ Highly correlated pairs (|r| > 0.7):\")\n",
 "        for c1, c2, r in sorted(high_corr, key=lambda x: abs(x[2]), reverse=True)[:5]:\n",
@@ -1093,31 +1088,31 @@
 "    entity_aggs = df.groupby(ENTITY_COLUMN).agg(agg_dict)\n",
 "    entity_aggs.columns = ['_'.join(col).strip() for col in entity_aggs.columns]\n",
 "    entity_aggs = entity_aggs.reset_index()\n",
-"
+"\n",
 "    # Get all numeric aggregated columns\n",
 "    all_agg_cols = [c for c in entity_aggs.columns if c != ENTITY_COLUMN]\n",
-"
+"\n",
 "    # Select top 4 by variance across ALL aggregation types\n",
 "    variances = entity_aggs[all_agg_cols].var().sort_values(ascending=False)\n",
 "    top_features = variances.head(4).index.tolist()\n",
-"
+"\n",
 "    # Sample if needed\n",
 "    sample_size = min(1000, len(entity_aggs))\n",
 "    scatter_sample = entity_aggs.sample(sample_size, random_state=42) if sample_size < len(entity_aggs) else entity_aggs\n",
-"
+"\n",
 "    print(f\"Scatter Matrix (n={len(scatter_sample):,} entities)\")\n",
 "    print(f\"  Total aggregated features: {len(all_agg_cols)}\")\n",
 "    print(f\"  Selected (top 4 by variance): {top_features}\")\n",
-"
+"\n",
 "    # Short labels for x-axis (no line breaks)\n",
 "    short_labels = [f.replace('_', ' ') for f in top_features]\n",
-"
+"\n",
 "    scatter_data = scatter_sample[top_features].copy()\n",
 "    scatter_data.columns = short_labels\n",
-"
+"\n",
 "    fig = charts.scatter_matrix(scatter_data, height=500)\n",
 "    fig.update_traces(marker=dict(opacity=0.5, size=4))\n",
-"
+"\n",
 "    # Update y-axis labels to be multirow, keep x-axis single row\n",
 "    n_features = len(short_labels)\n",
 "    for i in range(n_features):\n",
@@ -1125,19 +1120,19 @@
 "        yaxis_name = f'yaxis{i+1}' if i > 0 else 'yaxis'\n",
 "        y_label = top_features[i].replace('_', '<br>')\n",
 "        fig.update_layout(**{yaxis_name: dict(title=dict(text=y_label))})\n",
-"
+"\n",
 "        # X-axis: single row (spaces instead of underscores)\n",
 "        xaxis_name = f'xaxis{i+1}' if i > 0 else 'xaxis'\n",
 "        x_label = top_features[i].replace('_', ' ')\n",
 "        fig.update_layout(**{xaxis_name: dict(title=dict(text=x_label))})\n",
-"
+"\n",
 "    fig.update_layout(\n",
 "        title=\"Feature Relationships (Top 4 by Variance)\",\n",
 "        margin=dict(l=100, r=20, t=50, b=60)\n",
 "    )\n",
-"
+"\n",
 "    display_figure(fig)\n",
-"
+"\n",
 "    print(\"\\n📈 Scatter Matrix Insights:\")\n",
 "    print(\"  • Different aggregation types create different patterns/bands\")\n",
 "    print(\"  • sum features often show exponential-like distributions\")\n",
@@ -1238,31 +1233,31 @@
 "if len(numeric_event_cols) >= 2:\n",
 "    variances = df[numeric_event_cols].var().sort_values(ascending=False)\n",
 "    sparkline_cols = variances.index.tolist()\n",
-"
+"\n",
 "    print(\"\\n\" + \"=\"*70)\n",
 "    print(\"TEMPORAL SPARKLINES - COHORT × TIME PERIOD\")\n",
 "    print(\"=\"*70)\n",
 "    print(f\"\\n{len(sparkline_cols)} features analyzed across Weekly/Monthly/Yearly periods\")\n",
-"
+"\n",
 "    if ENTITY_COLUMN and TIME_COLUMN:\n",
 "        df_spark = df.copy()\n",
 "        df_spark['_week'] = pd.to_datetime(df_spark[TIME_COLUMN]).dt.to_period('W').dt.start_time\n",
 "        df_spark['_month'] = pd.to_datetime(df_spark[TIME_COLUMN]).dt.to_period('M').dt.start_time\n",
 "        df_spark['_year'] = pd.to_datetime(df_spark[TIME_COLUMN]).dt.to_period('Y').dt.start_time\n",
-"
+"\n",
 "        has_target = TARGET_COLUMN and TARGET_COLUMN in df.columns\n",
 "        all_actions = []\n",
-"
+"\n",
 "        for col in sparkline_cols:\n",
 "            if col not in df_spark.columns:\n",
 "                continue\n",
-"
+"\n",
 "            feature_data = {}\n",
 "            cohort_masks = ([(\"retained\", df_spark[TARGET_COLUMN] == 1),\n",
 "                             (\"churned\", df_spark[TARGET_COLUMN] == 0),\n",
-"                             (\"overall\", slice(None))] if has_target
+"                             (\"overall\", slice(None))] if has_target\n",
 "                            else [(\"overall\", slice(None))])\n",
-"
+"\n",
 "            for cohort, mask in cohort_masks:\n",
 "                cohort_df = df_spark[mask] if isinstance(mask, pd.Series) else df_spark\n",
 "                feature_data[cohort] = {\n",
@@ -1270,38 +1265,38 @@
 "                    \"monthly\": cohort_df.groupby('_month')[col].mean().dropna().tolist(),\n",
 "                    \"yearly\": cohort_df.groupby('_year')[col].mean().dropna().tolist(),\n",
 "                }\n",
-"
+"\n",
 "            period_effects = None\n",
 "            if has_target:\n",
 "                analysis = charts.analyze_cohort_trends(feature_data, col)\n",
-"                period_effects = {p: analysis[\"periods\"][p][\"effect_size\"]
+"                period_effects = {p: analysis[\"periods\"][p][\"effect_size\"]\n",
 "                                  for p in analysis[\"periods\"]}\n",
 "                all_actions.extend(analysis.get(\"actions\", []))\n",
-"
+"\n",
 "            fig = charts.cohort_sparklines(feature_data, feature_name=col, period_effects=period_effects)\n",
 "            display_figure(fig)\n",
-"
+"\n",
 "        if has_target and all_actions:\n",
 "            print(\"\\n\" + \"=\"*70)\n",
 "            print(\"TREND & VARIANCE RECOMMENDATIONS\")\n",
 "            print(\"=\"*70)\n",
-"
+"\n",
 "            BOLD, RESET = \"\\033[1m\", \"\\033[0m\"\n",
-"
+"\n",
 "            type_labels = {\n",
 "                \"add_trend_feature\": \"📈 Add Trend Features (opposite cohort trends)\",\n",
 "                \"add_time_indicator\": \"📅 Add Time Indicators (seasonality detected)\",\n",
 "                \"robust_scale\": \"🔧 Apply Robust Scaling (high variance ratio)\",\n",
 "                \"normalize\": \"📊 Apply Normalization (high variance)\",\n",
 "            }\n",
-"
+"\n",
 "            by_type = {}\n",
 "            for action in all_actions:\n",
 "                action_type = action[\"action_type\"]\n",
 "                if action_type not in by_type:\n",
 "                    by_type[action_type] = []\n",
 "                by_type[action_type].append(action)\n",
-"
+"\n",
 "            for action_type, actions in by_type.items():\n",
 "                print(f\"\\n{type_labels.get(action_type, action_type)}:\")\n",
 "                for a in actions:\n",
@@ -1389,37 +1384,37 @@
 "    })\n",
 "    entity_aggs.columns = ['_'.join(col).strip() for col in entity_aggs.columns]\n",
 "    entity_aggs = entity_aggs.reset_index()\n",
-"
+"\n",
 "    # Add target\n",
 "    entity_target = df.groupby(ENTITY_COLUMN)[TARGET_COLUMN].first().reset_index()\n",
 "    entity_df = entity_aggs.merge(entity_target, on=ENTITY_COLUMN)\n",
-"
+"\n",
 "    # Add derived features\n",
 "    entity_df['tenure_days'] = (entity_df[f'{TIME_COLUMN}_max'] - entity_df[f'{TIME_COLUMN}_min']).dt.days\n",
 "    entity_df['event_count'] = entity_df[f'{TIME_COLUMN}_count']\n",
-"
+"\n",
 "    # Calculate effect sizes (Cohen's d) for entity-level features\n",
 "    # Exclude entity, target, and temporal metadata columns\n",
 "    effect_feature_cols = [c for c in entity_df.select_dtypes(include=[np.number]).columns\n",
 "                           if c not in [ENTITY_COLUMN, TARGET_COLUMN]\n",
 "                           and c not in TEMPORAL_METADATA_COLS]\n",
-"
+"\n",
 "    print(\"=\"*80)\n",
 "    print(\"ENTITY-LEVEL FEATURE EFFECT SIZES (Cohen's d)\")\n",
 "    print(\"=\"*80)\n",
 "    print(f\"\\nAnalyzing {len(effect_feature_cols)} aggregated features at entity level\")\n",
 "    print(f\"Entities: {len(entity_df):,} (Retained: {(entity_df[TARGET_COLUMN]==1).sum():,}, Churned: {(entity_df[TARGET_COLUMN]==0).sum():,})\\n\")\n",
-"
+"\n",
 "    effect_sizes = []\n",
 "    for col in effect_feature_cols:\n",
 "        churned = entity_df[entity_df[TARGET_COLUMN] == 0][col].dropna()\n",
 "        retained = entity_df[entity_df[TARGET_COLUMN] == 1][col].dropna()\n",
-"
+"\n",
 "        if len(churned) > 0 and len(retained) > 0:\n",
-"            pooled_std = np.sqrt(((len(churned)-1)*churned.std()**2 + (len(retained)-1)*retained.std()**2)
+"            pooled_std = np.sqrt(((len(churned)-1)*churned.std()**2 + (len(retained)-1)*retained.std()**2) /\n",
 "                                 (len(churned) + len(retained) - 2))\n",
 "            d = (retained.mean() - churned.mean()) / pooled_std if pooled_std > 0 else 0\n",
-"
+"\n",
 "            abs_d = abs(d)\n",
 "            if abs_d >= 0.8:\n",
 "                interp, emoji = \"Large effect\", \"🔴\"\n",
@@ -1429,52 +1424,52 @@
 "                interp, emoji = \"Small effect\", \"🟢\"\n",
 "            else:\n",
 "                interp, emoji = \"Negligible\", \"⚪\"\n",
-"
+"\n",
 "            effect_sizes.append({\n",
-"                \"feature\": col, \"cohens_d\": d, \"abs_d\": abs_d
+"                \"feature\": col, \"cohens_d\": d, \"abs_d\": abs_d,\n",
 "                \"interpretation\": interp, \"emoji\": emoji,\n",
 "                \"retained_mean\": retained.mean(), \"churned_mean\": churned.mean()\n",
 "            })\n",
-"
+"\n",
 "    # Sort and display\n",
 "    effect_df = pd.DataFrame(effect_sizes).sort_values(\"abs_d\", ascending=False)\n",
-"
+"\n",
 "    print(f\"{'Feature':<35} {'d':>8} {'Effect':<15} {'Direction':<20}\")\n",
 "    print(\"-\" * 80)\n",
 "    for _, row in effect_df.head(15).iterrows():\n",
 "        direction = \"↑ Higher in retained\" if row[\"cohens_d\"] > 0 else \"↓ Lower in retained\"\n",
 "        print(f\"{row['emoji']} {row['feature'][:33]:<33} {row['cohens_d']:>+8.3f} {row['interpretation']:<15} {direction:<20}\")\n",
-"
+"\n",
 "    # Categorize features\n",
 "    large_effect = effect_df[effect_df[\"abs_d\"] >= 0.8][\"feature\"].tolist()\n",
 "    medium_effect = effect_df[(effect_df[\"abs_d\"] >= 0.5) & (effect_df[\"abs_d\"] < 0.8)][\"feature\"].tolist()\n",
 "    small_effect = effect_df[(effect_df[\"abs_d\"] >= 0.2) & (effect_df[\"abs_d\"] < 0.5)][\"feature\"].tolist()\n",
-"
+"\n",
 "    # INTERPRETATION\n",
 "    print(\"\\n\" + \"─\"*80)\n",
 "    print(\"📖 INTERPRETATION & RECOMMENDATIONS\")\n",
 "    print(\"─\"*80)\n",
-"
+"\n",
 "    if large_effect:\n",
-"        print(
+"        print(\"\\n🔴 LARGE EFFECT (|d| ≥ 0.8) - Priority Features:\")\n",
 "        for f in large_effect[:5]:\n",
 "            row = effect_df[effect_df[\"feature\"] == f].iloc[0]\n",
 "            direction = \"higher\" if row[\"cohens_d\"] > 0 else \"lower\"\n",
 "            print(f\"  • {f}: Retained customers have {direction} values\")\n",
 "            print(f\"    Mean: Retained={row['retained_mean']:.2f}, Churned={row['churned_mean']:.2f}\")\n",
 "            print(\"    → MUST include in predictive model\")\n",
-"
+"\n",
 "    if medium_effect:\n",
-"        print(
+"        print(\"\\n🟡 MEDIUM EFFECT (0.5 ≤ |d| < 0.8) - Useful Features:\")\n",
 "        for f in medium_effect[:3]:\n",
 "            print(f\"  • {f}\")\n",
 "        print(\"    → Should include in model\")\n",
-"
+"\n",
 "    if small_effect:\n",
-"        print(
+"        print(\"\\n🟢 SMALL EFFECT (0.2 ≤ |d| < 0.5) - Supporting Features:\")\n",
 "        print(f\"  {', '.join(small_effect[:5])}\")\n",
 "        print(\"    → May help in combination with other features\")\n",
-"
+"\n",
 "    negligible = effect_df[effect_df[\"abs_d\"] < 0.2][\"feature\"].tolist()\n",
 "    if negligible:\n",
 "        print(f\"\\n⚪ NEGLIGIBLE EFFECT (|d| < 0.2): {len(negligible)} features\")\n",
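Aside on the effect-size cell reflowed above: it computes Cohen's d with a pooled standard deviation and then buckets features by |d|. A standalone sketch of the same formula, written here for illustration only and not using churnkit's own helpers:

import numpy as np

def cohens_d(retained: np.ndarray, churned: np.ndarray) -> float:
    # Pooled standard deviation (ddof=1 to match pandas' sample std used in the notebook)
    n_r, n_c = len(retained), len(churned)
    pooled_std = np.sqrt(((n_r - 1) * retained.std(ddof=1) ** 2 +
                          (n_c - 1) * churned.std(ddof=1) ** 2) / (n_r + n_c - 2))
    return (retained.mean() - churned.mean()) / pooled_std if pooled_std > 0 else 0.0

# Example with made-up values; thresholds printed by the notebook:
# |d| >= 0.8 large, >= 0.5 medium, >= 0.2 small, else negligible.
d = cohens_d(np.array([5.0, 6.0, 7.0, 8.0]), np.array([2.0, 3.0, 4.0, 5.0]))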
@@ -1510,39 +1505,39 @@
 "    # Select top features by effect size for visualization\n",
 "    top_features = effect_df.head(6)[\"feature\"].tolist()\n",
 "    n_features = len(top_features)\n",
-"
+"\n",
 "    if n_features > 0:\n",
 "        print(\"=\"*70)\n",
 "        print(\"DISTRIBUTION COMPARISON: Retained vs Churned (Box Plots)\")\n",
 "        print(\"=\"*70)\n",
 "        print(\"\\n📊 Showing top 6 features by effect size\")\n",
 "        print(\"   🟢 Green = Retained | 🔴 Red = Churned\\n\")\n",
-"
+"\n",
 "        fig = make_subplots(rows=1, cols=n_features, subplot_titles=top_features, horizontal_spacing=0.05)\n",
-"
+"\n",
 "        for i, col in enumerate(top_features):\n",
 "            col_num = i + 1\n",
-"
+"\n",
 "            # Retained (1) - Green\n",
 "            retained_data = entity_df[entity_df[TARGET_COLUMN] == 1][col].dropna()\n",
 "            fig.add_trace(go.Box(y=retained_data, name='Retained',\n",
 "                fillcolor='rgba(46, 204, 113, 0.7)', line=dict(color='#1e8449', width=2),\n",
 "                boxpoints='outliers', width=0.35, showlegend=(i == 0), legendgroup='retained',\n",
 "                marker=dict(color='rgba(46, 204, 113, 0.5)', size=4)), row=1, col=col_num)\n",
-"
+"\n",
 "            # Churned (0) - Red\n",
 "            churned_data = entity_df[entity_df[TARGET_COLUMN] == 0][col].dropna()\n",
 "            fig.add_trace(go.Box(y=churned_data, name='Churned',\n",
 "                fillcolor='rgba(231, 76, 60, 0.7)', line=dict(color='#922b21', width=2),\n",
 "                boxpoints='outliers', width=0.35, showlegend=(i == 0), legendgroup='churned',\n",
 "                marker=dict(color='rgba(231, 76, 60, 0.5)', size=4)), row=1, col=col_num)\n",
-"
+"\n",
 "        fig.update_layout(height=450, title_text=\"Top Features: Retained (Green) vs Churned (Red)\",\n",
 "                          template='plotly_white', showlegend=True, boxmode='group',\n",
 "                          legend=dict(orientation=\"h\", yanchor=\"bottom\", y=1.05, xanchor=\"center\", x=0.5))\n",
 "        fig.update_xaxes(showticklabels=False)\n",
 "        display_figure(fig)\n",
-"
+"\n",
 "        # INTERPRETATION\n",
 "        print(\"─\"*70)\n",
 "        print(\"📖 HOW TO READ BOX PLOTS\")\n",
@@ -1594,28 +1589,28 @@
 "    print(\"=\"*70)\n",
 "    print(\"FEATURE-TARGET CORRELATIONS (Entity-Level)\")\n",
 "    print(\"=\"*70)\n",
-"
+"\n",
 "    correlations = []\n",
 "    for col in effect_feature_cols:\n",
 "        if col != TARGET_COLUMN:\n",
 "            corr = entity_df[[col, TARGET_COLUMN]].corr().iloc[0, 1]\n",
 "            if not np.isnan(corr):\n",
 "                correlations.append({\"Feature\": col, \"Correlation\": corr})\n",
-"
+"\n",
 "    if correlations:\n",
 "        corr_df = pd.DataFrame(correlations).sort_values(\"Correlation\", key=abs, ascending=False)\n",
-"
+"\n",
 "        fig = charts.bar_chart(\n",
 "            corr_df[\"Feature\"].head(12).tolist(),\n",
 "            corr_df[\"Correlation\"].head(12).tolist(),\n",
 "            title=f\"Feature Correlations with {TARGET_COLUMN}\"\n",
 "        )\n",
 "        display_figure(fig)\n",
-"
+"\n",
 "        print(\"\\n📊 Correlation Rankings:\")\n",
 "        print(f\"{'Feature':<35} {'Correlation':>12} {'Strength':<15} {'Direction'}\")\n",
 "        print(\"-\" * 75)\n",
-"
+"\n",
 "        for _, row in corr_df.head(10).iterrows():\n",
 "            abs_corr = abs(row[\"Correlation\"])\n",
 "            if abs_corr >= 0.5:\n",
@@ -1626,10 +1621,10 @@
 "                strength = \"Weak\"\n",
 "            else:\n",
 "                strength = \"Very weak\"\n",
-"
+"\n",
 "            direction = \"Positive\" if row[\"Correlation\"] > 0 else \"Negative\"\n",
 "            print(f\"{row['Feature'][:34]:<35} {row['Correlation']:>+12.3f} {strength:<15} {direction}\")\n",
-"
+"\n",
 "    # INTERPRETATION\n",
 "    print(\"\\n\" + \"─\"*70)\n",
 "    print(\"📖 INTERPRETING CORRELATIONS WITH TARGET\")\n",
@@ -1680,22 +1675,22 @@
 "\n",
 "if 'effect_df' in dir() and len(effect_df) > 0:\n",
 "    large_effects = effect_df[effect_df['cohens_d'].abs() >= 0.5]\n",
-"    print(
+"    print(\"\\n📊 Effect Size Summary:\")\n",
 "    print(f\"  • Total features analyzed: {len(effect_df)}\")\n",
 "    print(f\"  • Features with |d| ≥ 0.5 (medium+): {len(large_effects)}\")\n",
 "    print(f\"  • Features with |d| < 0.2 (negligible): {(effect_df['cohens_d'].abs() < 0.2).sum()}\")\n",
-"
+"\n",
 "    if len(large_effects) > 0:\n",
 "        print(\"\\n  Top differentiators:\")\n",
 "        for _, row in large_effects.head(5).iterrows():\n",
 "            direction = \"↑ higher in retained\" if row['cohens_d'] > 0 else \"↓ lower in retained\"\n",
 "            print(f\"    • \\033[1m{row['feature']}\\033[0m: d={row['cohens_d']:+.2f} ({direction})\")\n",
-"
+"\n",
 "    print(\"\\n📋 What the Three Approaches Showed:\")\n",
 "    print(\"  • Cohen's d → identified features with strongest mean separation\")\n",
 "    print(\"  • Correlation → confirmed linear relationship direction\")\n",
 "    print(\"  • Box plots → revealed distribution shapes and outliers\")\n",
-"
+"\n",
 "    print(\"\\n💡 RECOMMENDATIONS:\")\n",
 "    print(\"  → Prioritize features with |d| > 0.5 in model\")\n",
 "    print(\"  → Consider dropping features with |d| < 0.2\")\n",
@@ -1791,15 +1786,15 @@
 "\n",
 "if ENTITY_COLUMN:\n",
 "    reference_date = df[TIME_COLUMN].max()\n",
-"
+"\n",
 "    # Compute recency_result for use in summary cells\n",
 "    recency_result = analyzer.analyze_recency(df, ENTITY_COLUMN, TARGET_COLUMN, reference_date)\n",
-"
+"\n",
 "    if TARGET_COLUMN and TARGET_COLUMN in df.columns:\n",
 "        recency_comparison = compare_recency_by_target(\n",
 "            df, ENTITY_COLUMN, TIME_COLUMN, TARGET_COLUMN, reference_date\n",
 "        )\n",
-"
+"\n",
 "        if recency_comparison:\n",
 "            # Combined visualization panel\n",
 "            entity_last = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].max().reset_index()\n",
@@ -1808,10 +1803,10 @@
 "            entity_recency = entity_last.merge(entity_target, on=ENTITY_COLUMN)\n",
 "            cap = entity_recency[\"recency_days\"].quantile(0.99)\n",
 "            entity_capped = entity_recency[entity_recency[\"recency_days\"] <= cap]\n",
-"
+"\n",
 "            retained = entity_capped[entity_capped[TARGET_COLUMN] == 1][\"recency_days\"].values\n",
 "            churned = entity_capped[entity_capped[TARGET_COLUMN] == 0][\"recency_days\"].values\n",
-"
+"\n",
 "            fig = charts.recency_analysis_panel(\n",
 "                retained_recency=retained,\n",
 "                churned_recency=churned,\n",
@@ -1821,14 +1816,14 @@
 "                cap_value=cap\n",
 "            )\n",
 "            display_figure(fig)\n",
-"
+"\n",
 "            # Key Findings\n",
 "            console.start_section()\n",
 "            console.header(\"Key Findings\")\n",
 "            for insight in recency_comparison.key_findings:\n",
 "                console.info(insight.finding)\n",
 "            console.end_section()\n",
-"
+"\n",
 "            # Statistics\n",
 "            ret, churn = recency_comparison.retained_stats, recency_comparison.churned_stats\n",
 "            console.start_section()\n",
@@ -1837,7 +1832,7 @@
 "            console.metric(\"Churned (n)\", f\"{churn.count:,}\")\n",
 "            print(f\"{'Metric':<15} {'Retained':>12} {'Churned':>12} {'Diff':>12}\")\n",
 "            print(\"-\" * 52)\n",
-"            for name, r, c in [(\"Mean\", ret.mean, churn.mean), (\"Median\", ret.median, churn.median)
+"            for name, r, c in [(\"Mean\", ret.mean, churn.mean), (\"Median\", ret.median, churn.median),\n",
 "                               (\"Std Dev\", ret.std, churn.std)]:\n",
 "                print(f\"{name:<15} {r:>12.1f} {c:>12.1f} {c-r:>+12.1f}\")\n",
 "            console.metric(\"Effect Size\", f\"{recency_comparison.cohens_d:+.2f} ({recency_comparison.effect_interpretation})\")\n",
@@ -1845,7 +1840,7 @@
 "            if recency_comparison.inflection_bucket:\n",
 "                console.metric(\"Inflection\", recency_comparison.inflection_bucket)\n",
 "            console.end_section()\n",
-"
+"\n",
 "            # Actionable Recommendations\n",
 "            console.start_section()\n",
 "            console.header(\"Actionable Recommendations\")\n",
@@ -1865,13 +1860,13 @@
 "        median_recency = entity_last[\"recency_days\"].median()\n",
 "        cap = entity_last[\"recency_days\"].quantile(0.99)\n",
 "        capped = entity_last[entity_last[\"recency_days\"] <= cap]\n",
-"
+"\n",
 "        fig = go.Figure()\n",
 "        fig.add_trace(go.Histogram(x=capped[\"recency_days\"], nbinsx=50, marker_color=\"coral\", opacity=0.7))\n",
 "        fig.add_vline(x=median_recency, line_dash=\"solid\", line_color=\"green\", annotation_text=f\"Median: {median_recency:.0f} days\")\n",
 "        fig.update_layout(title=f\"Recency Distribution (capped at {cap:.0f} days)\", xaxis_title=\"Days Since Last Event\", yaxis_title=\"Count\", template=\"plotly_white\", height=400)\n",
 "        display_figure(fig)\n",
-"
+"\n",
 "        console.start_section()\n",
 "        console.header(\"Recency Statistics\")\n",
 "        console.metric(\"Median\", f\"{median_recency:.0f} days\")\n",
@@ -1939,23 +1934,23 @@
 "# Velocity & Acceleration Cohort Analysis with Effect Size Heatmap\n",
 "if ENTITY_COLUMN and TARGET_COLUMN and sparkline_cols:\n",
 "    continuous_cols = [c for c in sparkline_cols if df[c].nunique() > 2][:6]\n",
-"
+"\n",
 "    if not continuous_cols:\n",
 "        print(\"⚠️ No continuous numeric columns found for velocity analysis.\")\n",
 "    else:\n",
 "        print(\"=\"*70)\n",
 "        print(\"VELOCITY & ACCELERATION SIGNAL ANALYSIS\")\n",
 "        print(\"=\"*70)\n",
-"
+"\n",
 "        if 'feature_analyzer' not in dir():\n",
 "            feature_analyzer = TemporalFeatureAnalyzer(time_column=TIME_COLUMN, entity_column=ENTITY_COLUMN)\n",
-"
+"\n",
 "        windows = [7, 14, 30, 90, 180, 365]\n",
 "        print(f\"Analyzing {len(continuous_cols)} features across windows: {windows} days\")\n",
-"
+"\n",
 "        all_results = {}\n",
 "        heatmap_data = {\"velocity\": {}, \"acceleration\": {}}\n",
-"
+"\n",
 "        for col in continuous_cols:\n",
 "            results = feature_analyzer.compute_cohort_velocity_signals(\n",
 "                df, [col], TARGET_COLUMN, windows=windows\n",
@@ -1963,38 +1958,38 @@
 "            all_results[col] = results[col]\n",
 "            heatmap_data[\"velocity\"][col] = {f\"{r.window_days}d\": r.velocity_effect_size for r in results[col]}\n",
 "            heatmap_data[\"acceleration\"][col] = {f\"{r.window_days}d\": r.accel_effect_size for r in results[col]}\n",
-"
+"\n",
 "        fig = charts.velocity_signal_heatmap(heatmap_data, title=\"Cohort Separation: Velocity & Acceleration Effect Sizes (Cohen's d)\")\n",
 "        display_figure(fig)\n",
-"
+"\n",
 "        print(\"\\n\" + \"=\"*70)\n",
 "        print(\"DETAILED SPARKLINES (top features)\")\n",
 "        print(\"=\"*70)\n",
-"
-"        feature_max_d = [(col, max(abs(r.velocity_effect_size) for r in results))
+"\n",
+"        feature_max_d = [(col, max(abs(r.velocity_effect_size) for r in results))\n",
 "                         for col, results in all_results.items()]\n",
 "        feature_max_d.sort(key=lambda x: -x[1])\n",
-"
+"\n",
 "        top_features = [col for col, _ in feature_max_d[:3]]\n",
 "        for col in top_features:\n",
 "            fig = charts.cohort_velocity_sparklines(all_results[col], feature_name=col)\n",
 "            display_figure(fig)\n",
-"
+"\n",
 "        print(\"\\n\" + \"─\"*70)\n",
 "        print(\"📖 INTERPRETATION\")\n",
 "        print(\"─\"*70)\n",
 "        print(\"\\nVelocity measures rate of change; acceleration measures change in rate.\")\n",
 "        print(\"Positive d: retained > churned | Negative d: churned > retained\")\n",
 "        print(\"|d| ≥ 0.8: large effect | ≥ 0.5: medium | ≥ 0.2: small\\n\")\n",
-"
+"\n",
 "        interpretation_notes = feature_analyzer.generate_velocity_interpretation(all_results)\n",
 "        for note in interpretation_notes:\n",
 "            print(note)\n",
-"
+"\n",
 "        print(\"\\n\" + \"─\"*70)\n",
 "        print(\"🎯 FEATURE RECOMMENDATIONS\")\n",
 "        print(\"─\"*70)\n",
-"
+"\n",
 "        velocity_recs = feature_analyzer.generate_velocity_recommendations(all_results)\n",
 "        if velocity_recs:\n",
 "            for rec in velocity_recs:\n",
@@ -2007,7 +2002,7 @@
 "            print(\"\\nNo velocity/acceleration features recommended (no strong signals found).\")\n",
 "\n",
 "# Store velocity recommendations for pattern_summary\n",
-"VELOCITY_RECOMMENDATIONS = [{\"action\": r.action, \"source_column\": r.source_column
+"VELOCITY_RECOMMENDATIONS = [{\"action\": r.action, \"source_column\": r.source_column,\n",
 "                             \"description\": r.description, \"priority\": r.priority,\n",
 "                             \"effect_size\": r.effect_size, \"params\": r.params,\n",
 "                             \"features\": [f\"{r.source_column}_velocity_{r.params.get('window_days', 7)}d\"]}\n",
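Aside on the velocity cells above: they rely on TemporalFeatureAnalyzer.compute_cohort_velocity_signals, whose internals are not shown in this diff. As a rough illustration only of what "velocity" (rate of change) and "acceleration" (change in rate) mean for one entity's daily series, assuming a date-indexed pandas Series and not the package implementation:

import pandas as pd

def velocity_and_acceleration(s: pd.Series, window: int = 7) -> pd.DataFrame:
    # s: daily values for a single entity, indexed by a DatetimeIndex
    rolling_mean = s.rolling(f"{window}D").mean()
    velocity = rolling_mean.diff()       # day-over-day change in the rolling mean
    acceleration = velocity.diff()       # change in the rate of change
    return pd.DataFrame({"velocity": velocity, "acceleration": acceleration})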
@@ -2079,26 +2074,26 @@
 "    print(\"=\"*70)\n",
 "    print(\"MOMENTUM ANALYSIS (Window Ratios)\")\n",
 "    print(\"=\"*70)\n",
-"
+"\n",
 "    if 'feature_analyzer' not in dir():\n",
 "        feature_analyzer = TemporalFeatureAnalyzer(time_column=TIME_COLUMN, entity_column=ENTITY_COLUMN)\n",
-"
+"\n",
 "    # Use sparkline_cols directly (includes all numeric features ranked by variance)\n",
 "    momentum_cols = sparkline_cols[:6]\n",
-"
+"\n",
 "    # Build comprehensive window pairs from multiple sources:\n",
 "    # 1. Standard natural pairs (week/month/quarter)\n",
 "    natural_pairs = [(7, 30), (30, 90), (7, 90)]\n",
-"
+"\n",
 "    # 2. Recommended pairs from pattern_config (based on 01a aggregation windows)\n",
 "    recommended_pairs = pattern_config.get_momentum_pairs()\n",
-"
+"\n",
 "    # 3. Accumulation pair: shortest window vs all-time\n",
 "    max_days = (df[TIME_COLUMN].max() - df[TIME_COLUMN].min()).days\n",
 "    all_windows = [w for pair in natural_pairs + recommended_pairs for w in pair]\n",
 "    shortest_window = min(all_windows) if all_windows else 7\n",
 "    accumulation_pair = (shortest_window, max_days)\n",
-"
+"\n",
 "    # Combine and deduplicate (preserve order: natural first, then recommended, then accumulation)\n",
 "    seen = set()\n",
 "    window_pairs = []\n",
@@ -2106,20 +2101,20 @@
 "        if pair not in seen:\n",
 "            window_pairs.append(pair)\n",
 "            seen.add(pair)\n",
-"
+"\n",
 "    print(f\"Analyzing {len(momentum_cols)} features across {len(window_pairs)} window pairs:\")\n",
 "    print(f\"  Natural pairs (week/month/quarter): {natural_pairs}\")\n",
 "    print(f\"  Recommended pairs (from 01a): {recommended_pairs}\")\n",
 "    print(f\"  Accumulation pair: {shortest_window}d vs all-time ({max_days}d)\")\n",
 "    print(f\"  Combined (deduplicated): {len(window_pairs)} pairs\")\n",
-"
+"\n",
 "    all_momentum_results = {}\n",
 "    for col in momentum_cols:\n",
 "        results = feature_analyzer.compute_cohort_momentum_signals(\n",
 "            df, [col], TARGET_COLUMN, window_pairs=window_pairs\n",
 "        )\n",
 "        all_momentum_results[col] = results[col]\n",
-"
+"\n",
 "    print(\"\\n📊 Momentum by Cohort:\")\n",
 "    print(f\"{'Feature':<18} {'Window':<12} {'Retained':>10} {'Churned':>10} {'Effect d':>10}\")\n",
 "    print(\"-\" * 62)\n",
@@ -2127,7 +2122,7 @@
 "        for r in col_results:\n",
 "            label = r.window_label if r.long_window < 1000 else f\"{r.short_window}d/all\"\n",
 "            print(f\"{col[:17]:<18} {label:<12} {r.retained_momentum:>10.2f} {r.churned_momentum:>10.2f} {r.effect_size:>10.2f}\")\n",
-"
+"\n",
 "    # Bar chart for best window pair per feature - with window labels above bars\n",
 "    best_pair_data = {}\n",
 "    best_window_labels = {}  # Track which window was best\n",
@@ -2135,15 +2130,15 @@
 "        best = max(col_results, key=lambda r: abs(r.effect_size))\n",
 "        best_pair_data[col] = {\"retained\": best.retained_momentum, \"churned\": best.churned_momentum}\n",
 "        best_window_labels[col] = best.window_label if best.long_window < 1000 else f\"{best.short_window}d/all\"\n",
-"
+"\n",
 "    if best_pair_data:\n",
 "        import plotly.graph_objects as go\n",
 "        columns = list(best_pair_data.keys())\n",
 "        col_labels = [c[:15] for c in columns]\n",
-"
+"\n",
 "        # Find max y value for positioning labels above bars\n",
 "        max_y = max(max(best_pair_data[c][\"retained\"], best_pair_data[c][\"churned\"]) for c in columns)\n",
-"
+"\n",
 "        fig = go.Figure()\n",
 "        fig.add_trace(go.Bar(\n",
 "            name=\"🟢 Retained\", x=col_labels,\n",
@@ -2157,7 +2152,7 @@
 "        ))\n",
 "        fig.add_hline(y=1.0, line_dash=\"dash\", line_color=\"gray\",\n",
 "                      annotation_text=\"baseline\", annotation_position=\"right\")\n",
-"
+"\n",
 "        # Add window labels as annotations above each bar group\n",
 "        for i, col in enumerate(columns):\n",
 "            window_lbl = best_window_labels[col]\n",
@@ -2168,7 +2163,7 @@
 "                font=dict(size=10, color=\"#555\"),\n",
 "                xref=\"x\", yref=\"y\",\n",
 "            )\n",
-"
+"\n",
 "        fig.update_layout(\n",
 "            title=\"Momentum Comparison (Best Window per Feature)\",\n",
 "            xaxis_title=\"Feature\",\n",
@@ -2178,22 +2173,22 @@
 "            yaxis=dict(range=[0, max_y * 1.15]),  # Extra headroom for labels\n",
 "        )\n",
 "        display_figure(fig)\n",
-"
+"\n",
 "    print(\"\\n\" + \"─\"*70)\n",
 "    print(\"📖 INTERPRETATION\")\n",
 "    print(\"─\"*70)\n",
 "    print(\"\\nMomentum = recent_mean / historical_mean (per entity, then averaged)\")\n",
 "    print(\"> 1.0 = accelerating | < 1.0 = decelerating | ≈ 1.0 = stable\")\n",
 "    print(\"|d| measures how differently retained vs churned customers behave\\n\")\n",
-"
+"\n",
 "    interpretation_notes = feature_analyzer.generate_momentum_interpretation(all_momentum_results)\n",
 "    for note in interpretation_notes:\n",
 "        print(note)\n",
-"
+"\n",
 "    print(\"\\n\" + \"─\"*70)\n",
 "    print(\"🎯 FEATURE RECOMMENDATIONS\")\n",
 "    print(\"─\"*70)\n",
-"
+"\n",
 "    momentum_recs = feature_analyzer.generate_momentum_recommendations(all_momentum_results)\n",
 "    if momentum_recs:\n",
 "        for rec in momentum_recs:\n",
@@ -2206,7 +2201,7 @@
 "        print(\"\\nNo momentum features recommended (no strong cohort separation found).\")\n",
 "\n",
 "# Store momentum recommendations for pattern_summary\n",
-"MOMENTUM_RECOMMENDATIONS = [{\"action\": r.action, \"source_column\": r.source_column
+"MOMENTUM_RECOMMENDATIONS = [{\"action\": r.action, \"source_column\": r.source_column,\n",
 "                             \"description\": r.description, \"priority\": r.priority,\n",
 "                             \"effect_size\": r.effect_size, \"params\": r.params,\n",
 "                             \"features\": [f\"{r.source_column}_momentum_{r.params['short_window']}_{r.params['long_window']}\"]}\n",
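Aside on the momentum cells above: the notebook prints "Momentum = recent_mean / historical_mean (per entity, then averaged)". A minimal sketch of that ratio for one (short, long) day-window pair, with assumed column names and written independently of the churnkit API:

import pandas as pd

def entity_momentum(df: pd.DataFrame, entity_col: str, time_col: str,
                    value_col: str, short_days: int = 7, long_days: int = 30) -> pd.Series:
    # Recent window vs. longer historical window, both measured back from the latest event.
    ref = df[time_col].max()
    recent = df[df[time_col] >= ref - pd.Timedelta(days=short_days)]
    history = df[df[time_col] >= ref - pd.Timedelta(days=long_days)]
    recent_mean = recent.groupby(entity_col)[value_col].mean()
    hist_mean = history.groupby(entity_col)[value_col].mean()
    return (recent_mean / hist_mean).dropna()   # > 1 accelerating, < 1 decelerating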
@@ -2263,20 +2258,20 @@
|
|
|
2263
2258
|
"if ENTITY_COLUMN and sparkline_cols:\n",
|
|
2264
2259
|
" lag_cols = sparkline_cols[:6]\n",
|
|
2265
2260
|
" max_lag = 14\n",
|
|
2266
|
-
"
|
|
2261
|
+
"\n",
|
|
2267
2262
|
" print(\"=\"*70)\n",
|
|
2268
2263
|
" print(\"LAG CORRELATION ANALYSIS\")\n",
|
|
2269
2264
|
" print(\"=\"*70)\n",
|
|
2270
|
-
"
|
|
2265
|
+
"\n",
|
|
2271
2266
|
" if 'feature_analyzer' not in dir():\n",
|
|
2272
2267
|
" feature_analyzer = TemporalFeatureAnalyzer(time_column=TIME_COLUMN, entity_column=ENTITY_COLUMN)\n",
|
|
2273
|
-
"
|
|
2268
|
+
"\n",
|
|
2274
2269
|
" # Calculate lag correlations using framework\n",
|
|
2275
2270
|
" lag_results = feature_analyzer.calculate_lag_correlations(df, lag_cols, max_lag=max_lag)\n",
|
|
2276
|
-
"
|
|
2271
|
+
"\n",
|
|
2277
2272
|
" # Build data for heatmap\n",
|
|
2278
2273
|
" lag_corr_data = {col: result.correlations for col, result in lag_results.items()}\n",
|
|
2279
|
-
"
|
|
2274
|
+
"\n",
|
|
2280
2275
|
" # Use ChartBuilder for visualization\n",
|
|
2281
2276
|
" fig = charts.lag_correlation_heatmap(\n",
|
|
2282
2277
|
" lag_corr_data,\n",
|
|
@@ -2284,14 +2279,14 @@
 " title=\"Autocorrelation by Lag (days)\"\n",
 " )\n",
 " display_figure(fig)\n",
-"
+"\n",
 " # Display framework results\n",
 " print(\"\\n📊 Best Lag per Variable:\")\n",
 " for col, result in lag_results.items():\n",
 " best_lag_info = f\"best lag={result.best_lag}d (r={result.best_correlation:.2f})\"\n",
 " weekly_info = \" [Weekly pattern]\" if result.has_weekly_pattern else \"\"\n",
 " print(f\" {col[:25]}: {best_lag_info}{weekly_info}\")\n",
-"
+"\n",
 " # INTERPRETATION SECTION\n",
 " print(\"\\n\" + \"─\"*70)\n",
 " print(\"📖 INTERPRETATION\")\n",
@@ -2300,16 +2295,16 @@
 " print(\" • r > 0.5: Strong memory - today predicts tomorrow well\")\n",
 " print(\" • r 0.3-0.5: Moderate predictability from past\")\n",
 " print(\" • r < 0.3: Weak autocorrelation - lag features less useful\\n\")\n",
-"
+"\n",
 " interpretation_notes = feature_analyzer.generate_lag_interpretation(lag_results)\n",
 " for note in interpretation_notes:\n",
 " print(note)\n",
-"
+"\n",
 " # RECOMMENDATIONS SECTION\n",
 " print(\"\\n\" + \"─\"*70)\n",
 " print(\"🎯 FEATURE RECOMMENDATIONS\")\n",
 " print(\"─\"*70)\n",
-"
+"\n",
 " lag_recs = feature_analyzer.generate_lag_recommendations(lag_results)\n",
 " if lag_recs:\n",
 " for rec in lag_recs:\n",
@@ -2322,7 +2317,7 @@
 " print(\"\\nNo lag features recommended (no strong autocorrelation found).\")\n",
 "\n",
 "# Store lag recommendations for pattern_summary\n",
-"LAG_RECOMMENDATIONS = [{\"action\": r.action, \"source_column\": r.source_column
+"LAG_RECOMMENDATIONS = [{\"action\": r.action, \"source_column\": r.source_column,\n",
 " \"description\": r.description, \"priority\": r.priority,\n",
 " \"features\": [f\"{r.source_column}_lag_{r.params.get('lag_days', 7)}d\"],\n",
 " \"params\": r.params}\n",
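The lag-correlation hunks above are likewise string-literal repairs to the cell source. For readers who want to see what an autocorrelation-by-lag scan computes, a small illustrative sketch using pandas `Series.autocorr` follows; column names are hypothetical and this is not the notebook's `calculate_lag_correlations`:

```python
import pandas as pd

def lag_correlations(df: pd.DataFrame, entity_col: str, time_col: str,
                     value_col: str, max_lag: int = 14) -> pd.Series:
    """Average per-entity autocorrelation of a daily series at lags 1..max_lag."""
    corrs = {}
    for lag in range(1, max_lag + 1):
        per_entity = (
            df.sort_values(time_col)
              .groupby(entity_col)[value_col]
              .apply(lambda s: s.autocorr(lag=lag) if len(s) > lag + 1 else float("nan"))
        )
        corrs[lag] = per_entity.mean()
    return pd.Series(corrs, name=f"{value_col}_autocorr")

# Hypothetical usage: a daily usage series for one customer with a weekly-ish cycle
daily = pd.DataFrame({
    "customer_id": [1] * 30,
    "event_date": pd.date_range("2024-01-01", periods=30),
    "usage": [i % 7 for i in range(30)],
})
acf = lag_correlations(daily, "customer_id", "event_date", "usage", max_lag=14)
print("best lag:", acf.idxmax(), f"(r={acf.max():.2f})")
```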
@@ -2393,56 +2388,56 @@
 " print(\"=\"*70)\n",
 " print(\"PREDICTIVE POWER ANALYSIS (IV & KS Statistics)\")\n",
 " print(\"=\"*70)\n",
-"
+"\n",
 " if 'feature_analyzer' not in dir():\n",
 " feature_analyzer = TemporalFeatureAnalyzer(time_column=TIME_COLUMN, entity_column=ENTITY_COLUMN)\n",
-"
+"\n",
 " analysis_cols = [c for c in sparkline_cols[:8] if c in df.columns]\n",
-"
+"\n",
 " # Method handles aggregation to entity level internally\n",
 " iv_results = feature_analyzer.calculate_predictive_power(df, analysis_cols, TARGET_COLUMN)\n",
-"
+"\n",
 " n_retained = df.groupby(ENTITY_COLUMN)[TARGET_COLUMN].first().sum()\n",
 " n_churned = df[ENTITY_COLUMN].nunique() - n_retained\n",
 " print(f\"Analyzing {len(analysis_cols)} features\")\n",
 " print(f\"Entities: {df[ENTITY_COLUMN].nunique():,} (Retained: {n_retained:,}, Churned: {n_churned:,})\")\n",
-"
+"\n",
 " # Build visualization data\n",
 " iv_data = {col: {\"iv\": r.information_value, \"ks\": r.ks_statistic} for col, r in iv_results.items()}\n",
-"
+"\n",
 " # Create IV/KS comparison chart\n",
 " import plotly.graph_objects as go\n",
 " from plotly.subplots import make_subplots\n",
-"
+"\n",
 " features = list(iv_data.keys())\n",
 " ivs = [iv_data[f][\"iv\"] for f in features]\n",
 " kss = [iv_data[f][\"ks\"] for f in features]\n",
-"
+"\n",
 " # Sort by IV\n",
 " sorted_idx = sorted(range(len(ivs)), key=lambda i: ivs[i], reverse=True)\n",
 " features = [features[i] for i in sorted_idx]\n",
 " ivs = [ivs[i] for i in sorted_idx]\n",
 " kss = [kss[i] for i in sorted_idx]\n",
-"
+"\n",
 " fig = make_subplots(rows=1, cols=2, subplot_titles=[\"Information Value (IV)\", \"KS Statistic\"])\n",
-"
+"\n",
 " # IV bars with threshold colors\n",
 " iv_colors = [\"#27ae60\" if v >= 0.1 else \"#f39c12\" if v >= 0.02 else \"#95a5a6\" for v in ivs]\n",
-" fig.add_trace(go.Bar(y=features, x=ivs, orientation=\"h\", marker_color=iv_colors
+" fig.add_trace(go.Bar(y=features, x=ivs, orientation=\"h\", marker_color=iv_colors,\n",
 " showlegend=False, hovertemplate=\"%{y}: IV=%{x:.3f}<extra></extra>\"), row=1, col=1)\n",
 " fig.add_vline(x=0.1, line_dash=\"dash\", line_color=\"#27ae60\", annotation_text=\"Strong\", row=1, col=1)\n",
 " fig.add_vline(x=0.02, line_dash=\"dash\", line_color=\"#f39c12\", annotation_text=\"Medium\", row=1, col=1)\n",
-"
+"\n",
 " # KS bars\n",
 " ks_colors = [\"#3498db\" if v >= 0.3 else \"#85c1e9\" for v in kss]\n",
 " fig.add_trace(go.Bar(y=features, x=kss, orientation=\"h\", marker_color=ks_colors,\n",
 " showlegend=False, hovertemplate=\"%{y}: KS=%{x:.3f}<extra></extra>\"), row=1, col=2)\n",
-"
+"\n",
 " fig.update_layout(height=400, title=\"Predictive Power: IV & KS Statistics\")\n",
 " fig.update_xaxes(title_text=\"IV\", row=1, col=1)\n",
 " fig.update_xaxes(title_text=\"KS\", row=1, col=2)\n",
 " display_figure(fig)\n",
-"
+"\n",
 " # Details table\n",
 " print(\"\\n📊 Predictive Power Details:\")\n",
 " print(f\"{'Feature':<25} {'IV':>8} {'IV Strength':<12} {'KS':>8} {'KS Strength':<12}\")\n",
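Only broken string literals are repaired in this cell. For context on the two statistics it reports, a compact sketch of Information Value over quantile bins and the two-sample KS statistic via scipy; the inputs are synthetic and the binning choices are assumptions, not churnkit's internal `calculate_predictive_power`:

```python
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp

def information_value(feature: pd.Series, target: pd.Series, bins: int = 10) -> float:
    """IV = sum over bins of (pct_good - pct_bad) * ln(pct_good / pct_bad)."""
    binned = pd.qcut(feature, q=bins, duplicates="drop")
    tab = pd.crosstab(binned, target)
    good = tab[1] / tab[1].sum()   # assumes target == 1 means retained
    bad = tab[0] / tab[0].sum()
    good, bad = good.replace(0, 1e-6), bad.replace(0, 1e-6)  # avoid log(0)
    return float(((good - bad) * np.log(good / bad)).sum())

# Synthetic example: feature shifted by class so it carries some signal
rng = np.random.default_rng(0)
target = pd.Series(rng.integers(0, 2, 2000))
feature = pd.Series(rng.normal(loc=target * 0.8, scale=1.0))

iv = information_value(feature, target)
ks = ks_2samp(feature[target == 1], feature[target == 0]).statistic
print(f"IV={iv:.3f} (>=0.1 strong, 0.02-0.1 medium), KS={ks:.3f} (>=0.3 strong)")
```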
@@ -2450,16 +2445,16 @@
 " for col in features:\n",
 " r = iv_results[col]\n",
 " print(f\"{col[:24]:<25} {r.information_value:>8.3f} {r.iv_interpretation:<12} {r.ks_statistic:>8.3f} {r.ks_interpretation:<12}\")\n",
-"
+"\n",
 " # INTERPRETATION with cross-references\n",
 " print(\"\\n\" + \"─\"*70)\n",
 " print(\"📖 INTERPRETATION\")\n",
 " print(\"─\"*70)\n",
-"
+"\n",
 " strong_iv = [col for col, r in iv_results.items() if r.information_value >= 0.1]\n",
 " medium_iv = [col for col, r in iv_results.items() if 0.02 <= r.information_value < 0.1]\n",
 " weak_iv = [col for col, r in iv_results.items() if r.information_value < 0.02]\n",
-"
+"\n",
 " if strong_iv:\n",
 " print(f\"\\nStrong predictors (IV >= 0.1): {', '.join(strong_iv)}\")\n",
 " print(\" → High predictive value, prioritize in model\")\n",
@@ -2469,7 +2464,7 @@
 " if weak_iv:\n",
 " print(f\"\\nWeak predictors (IV < 0.02): {', '.join(weak_iv)}\")\n",
 " print(\" → Limited predictive value alone\")\n",
-"
+"\n",
 " # Cross-reference with 1c.10 Cohen's d if available\n",
 " if 'effect_df' in dir() and len(effect_df) > 0:\n",
 " print(\"\\n📎 Cross-reference with 1c.10 (Cohen's d):\")\n",
@@ -2486,24 +2481,24 @@
 " print(f\" {col}: d={d_val:+.2f}, IV={iv_val.information_value:.3f} {agreement}\")\n",
 " else:\n",
 " print(\" (No overlapping features to compare)\")\n",
-"
+"\n",
 " # RECOMMENDATIONS\n",
 " print(\"\\n\" + \"─\"*70)\n",
 " print(\"🎯 FEATURE RECOMMENDATIONS\")\n",
 " print(\"─\"*70)\n",
-"
+"\n",
 " if strong_iv:\n",
 " print(\"\\n🔴 INCLUDE (Strong IV)\")\n",
 " for col in strong_iv:\n",
 " r = iv_results[col]\n",
 " print(f\" • {col}: IV={r.information_value:.3f}, KS={r.ks_statistic:.3f}\")\n",
-"
+"\n",
 " if medium_iv:\n",
 " print(\"\\n🟡 INCLUDE (Medium IV)\")\n",
 " for col in medium_iv[:5]:\n",
 " r = iv_results[col]\n",
 " print(f\" • {col}: IV={r.information_value:.3f}\")\n",
-"
+"\n",
 " if not strong_iv and not medium_iv:\n",
 " print(\"\\nNo features with strong predictive power found.\")\n",
 " print(\" → See 1c.12 (velocity) and 1c.13 (momentum) for derived features\")\n",
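The cross-reference step compares IV against the Cohen's d table from 1c.10, two views of the same retained-versus-churned separation. For reference, a minimal Cohen's d helper on hypothetical arrays (not the notebook's `effect_df` computation):

```python
import numpy as np

def cohens_d(group_a: np.ndarray, group_b: np.ndarray) -> float:
    """Standardized mean difference using the pooled standard deviation."""
    na, nb = len(group_a), len(group_b)
    pooled_var = ((na - 1) * group_a.var(ddof=1) + (nb - 1) * group_b.var(ddof=1)) / (na + nb - 2)
    return float((group_a.mean() - group_b.mean()) / np.sqrt(pooled_var))

# Hypothetical: weekly logins for retained vs churned customers
retained = np.array([5.1, 6.3, 4.8, 7.0, 5.9, 6.1])
churned = np.array([2.2, 3.1, 1.8, 2.9, 3.4, 2.5])
print(f"d = {cohens_d(retained, churned):+.2f}  (|d| > 0.8 is conventionally a large effect)")
```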
@@ -2604,29 +2599,29 @@
 " print(\"=\"*70)\n",
 " print(\"CATEGORICAL FEATURE ANALYSIS\")\n",
 " print(\"=\"*70)\n",
-"
+"\n",
 " # Aggregate to entity level (take mode for categorical columns)\n",
-" cat_cols = [c for c in df.select_dtypes(include=['object', 'category']).columns
+" cat_cols = [c for c in df.select_dtypes(include=['object', 'category']).columns\n",
 " if c not in [ENTITY_COLUMN, TIME_COLUMN, TARGET_COLUMN]]\n",
-"
+"\n",
 " if cat_cols:\n",
 " entity_cats_df = df.groupby(ENTITY_COLUMN).agg(\n",
 " {c: lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else None for c in cat_cols}\n",
 " ).reset_index()\n",
 " entity_target = df.groupby(ENTITY_COLUMN)[TARGET_COLUMN].first().reset_index()\n",
 " entity_data = entity_cats_df.merge(entity_target, on=ENTITY_COLUMN)\n",
-"
+"\n",
 " cat_result = analyze_categorical_features(entity_data, ENTITY_COLUMN, TARGET_COLUMN)\n",
-"
+"\n",
 " print(f\"Features analyzed: {len(cat_result.feature_insights)}\")\n",
 " print(f\"Features filtered: {len(cat_result.filtered_columns)}\")\n",
 " print(f\"Overall target rate: {cat_result.overall_target_rate:.1%}\")\n",
-"
+"\n",
 " if cat_result.feature_insights:\n",
 " # VISUALS\n",
 " fig = charts.categorical_analysis_panel(cat_result.feature_insights, cat_result.overall_target_rate)\n",
 " display_figure(fig)\n",
-"
+"\n",
 " # DETAILS TABLE\n",
 " print(\"\\n📊 Feature Details:\")\n",
 " print(f\"{'Feature':<20} {'Cramér V':>10} {'Effect':>12} {'#Cats':>7} {'High Risk':>10} {'Low Risk':>10}\")\n",
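This cell collapses events to one mode value per entity and then scores each categorical feature with Cramér's V. A short illustrative sketch of the statistic via scipy's chi-square on hypothetical entity-level data (not `analyze_categorical_features` itself):

```python
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

def cramers_v(x: pd.Series, y: pd.Series) -> float:
    """Cramér's V: association between two categorical variables, 0 (none) to 1 (perfect)."""
    table = pd.crosstab(x, y)
    chi2 = chi2_contingency(table)[0]
    n = table.to_numpy().sum()
    min_dim = min(table.shape) - 1
    return float(np.sqrt(chi2 / (n * min_dim))) if min_dim > 0 else 0.0

# Hypothetical entity-level data: plan type vs retention flag
entity_data = pd.DataFrame({
    "plan": ["basic"] * 60 + ["premium"] * 40,
    "retained": [0] * 40 + [1] * 20 + [0] * 10 + [1] * 30,
})
print(f"Cramér's V = {cramers_v(entity_data['plan'], entity_data['retained']):.3f}")
```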
@@ -2634,41 +2629,41 @@
 " for insight in cat_result.feature_insights:\n",
 " print(f\"{insight.feature_name[:19]:<20} {insight.cramers_v:>10.3f} {insight.effect_strength:>12} \"\n",
 " f\"{insight.n_categories:>7} {len(insight.high_risk_categories):>10} {len(insight.low_risk_categories):>10}\")\n",
-"
+"\n",
 " # INTERPRETATION\n",
 " print(\"\\n\" + \"─\"*70)\n",
 " print(\"📖 INTERPRETATION\")\n",
 " print(\"─\"*70)\n",
-"
+"\n",
 " strong = [i for i in cat_result.feature_insights if i.effect_strength == \"strong\"]\n",
 " moderate = [i for i in cat_result.feature_insights if i.effect_strength == \"moderate\"]\n",
 " weak = [i for i in cat_result.feature_insights if i.effect_strength in (\"weak\", \"negligible\")]\n",
-"
+"\n",
 " if strong:\n",
 " print(f\"\\nStrong predictors ({len(strong)}): {', '.join(i.feature_name for i in strong)}\")\n",
 " print(\" → These features have clear category-target relationships\")\n",
 " print(\" → Include in model, consider one-hot encoding\")\n",
-"
+"\n",
 " if moderate:\n",
 " print(f\"\\nModerate predictors ({len(moderate)}): {', '.join(i.feature_name for i in moderate)}\")\n",
 " print(\" → Some predictive power, include if cardinality is reasonable\")\n",
-"
+"\n",
 " if weak:\n",
 " print(f\"\\nWeak/negligible ({len(weak)}): {', '.join(i.feature_name for i in weak)}\")\n",
 " print(\" → Limited predictive value, may add noise\")\n",
-"
+"\n",
 " # High-risk category insights\n",
 " all_high_risk = [(i.feature_name, c) for i in cat_result.feature_insights for c in i.high_risk_categories[:2]]\n",
 " if all_high_risk:\n",
 " print(\"\\nHigh-risk segments (below-average retention):\")\n",
 " for feat, cat in all_high_risk[:5]:\n",
 " print(f\" • {feat} = '{cat}'\")\n",
-"
+"\n",
 " # RECOMMENDATIONS\n",
 " print(\"\\n\" + \"─\"*70)\n",
 " print(\"🎯 FEATURE RECOMMENDATIONS\")\n",
 " print(\"─\"*70)\n",
-"
+"\n",
 " if cat_result.recommendations:\n",
 " for rec in cat_result.recommendations:\n",
 " priority_marker = \"🔴\" if rec.get('priority') == 'high' else \"🟡\"\n",
@@ -2680,13 +2675,13 @@
 " print(\"\\n🔴 INCLUDE STRONG PREDICTORS\")\n",
 " for i in strong:\n",
 " print(f\" • {i.feature_name}: V={i.cramers_v:.3f}, {i.n_categories} categories\")\n",
-"
+"\n",
 " if any(i.n_categories > 20 for i in cat_result.feature_insights):\n",
 " high_card = [i for i in cat_result.feature_insights if i.n_categories > 20]\n",
 " print(\"\\n🟡 HIGH CARDINALITY - CONSIDER GROUPING\")\n",
 " for i in high_card:\n",
 " print(f\" • {i.feature_name}: {i.n_categories} categories → group rare categories\")\n",
-"
+"\n",
 " if not strong and not moderate:\n",
 " print(\"\\nNo strong categorical predictors found.\")\n",
 " print(\" • Consider creating derived features (e.g., category combinations)\")\n",
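The high-cardinality recommendation above ("group rare categories") maps to a small pandas transform. A hedged sketch with a hypothetical `channel` column and an arbitrary 1% share cutoff:

```python
import pandas as pd

def group_rare_categories(s: pd.Series, min_share: float = 0.01, other_label: str = "Other") -> pd.Series:
    """Replace categories whose frequency share is below min_share with a single bucket."""
    share = s.value_counts(normalize=True)
    rare = share[share < min_share].index
    return s.where(~s.isin(rare), other_label)

# Hypothetical usage
channel = pd.Series(["web"] * 500 + ["mobile"] * 480 + ["fax"] * 3 + ["telex"] * 2)
print(group_rare_categories(channel).value_counts())
```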
@@ -2824,12 +2819,12 @@
 "print(f\" Velocity: {pattern_config.velocity_window_days}d | Momentum: {pattern_config.get_momentum_pairs()}\")\n",
 "\n",
 "# Trend summary\n",
-"print(
+"print(\"\\n📈 TREND:\")\n",
 "print(f\" Direction: {trend_result.direction.value}\")\n",
 "print(f\" Confidence: {trend_result.confidence}\")\n",
 "\n",
 "# Seasonality summary\n",
-"print(
+"print(\"\\n🔁 SEASONALITY:\")\n",
 "if seasonality_results:\n",
 " for sr in seasonality_results[:2]:\n",
 " period_name = sr.period_name or f\"{sr.period}-day\"\n",
@@ -2839,7 +2834,7 @@
 "\n",
 "# Recency summary\n",
 "if ENTITY_COLUMN:\n",
-" print(
+" print(\"\\n⏱️ RECENCY:\")\n",
 " print(f\" Median: {recency_result.median_recency_days:.0f} days\")\n",
 " if recency_result.target_correlation:\n",
 " corr = recency_result.target_correlation\n",
@@ -2905,7 +2900,7 @@
 "if seasonality_results:\n",
 " weekly = any(6 <= sr.period <= 8 for sr in seasonality_results)\n",
 " monthly = any(28 <= sr.period <= 32 for sr in seasonality_results)\n",
-"
+"\n",
 " print(\"\\n2. SEASONALITY FEATURES:\")\n",
 " if weekly:\n",
 " print(\" - is_weekend (binary)\")\n",
@@ -2993,7 +2988,7 @@
 "if seasonality_results:\n",
 " strong_patterns = [sr for sr in seasonality_results if sr.strength > 0.5]\n",
 " moderate_patterns = [sr for sr in seasonality_results if 0.3 < sr.strength <= 0.5]\n",
-"
+"\n",
 " for sr in seasonality_results:\n",
 " if sr.period == 7:\n",
 " seasonality_recs.append({\n",
@@ -3010,13 +3005,13 @@
 " \"action\": \"add_cyclical_feature\", \"feature\": \"quarter\", \"encoding\": \"sin_cos\",\n",
 " \"reason\": f\"Quarterly pattern detected (strength={sr.strength:.2f})\"\n",
 " })\n",
-"
+"\n",
 " if strong_patterns:\n",
 " seasonality_recs.append({\n",
 " \"action\": \"consider_deseasonalization\", \"periods\": [sr.period for sr in strong_patterns],\n",
 " \"reason\": \"Strong seasonal patterns may dominate signal\"\n",
 " })\n",
-"
+"\n",
 " if 'window_lags' in dir() and window_lags:\n",
 " aligned = [sr for sr in seasonality_results if sr.period in window_lags]\n",
 " if aligned:\n",
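The `add_cyclical_feature` recommendations with `sin_cos` encoding refer to mapping a periodic value onto a circle so that the last and first values of a cycle sit next to each other (Sunday next to Monday, December next to January). A minimal sketch with a hypothetical `event_date` column, not the downstream 01d implementation:

```python
import numpy as np
import pandas as pd

def add_cyclical_features(df: pd.DataFrame, time_col: str) -> pd.DataFrame:
    """Encode day-of-week and month as sin/cos pairs plus an is_weekend flag."""
    out = df.copy()
    dow = out[time_col].dt.dayofweek      # 0=Monday .. 6=Sunday
    month = out[time_col].dt.month - 1    # 0..11
    out["dow_sin"], out["dow_cos"] = np.sin(2 * np.pi * dow / 7), np.cos(2 * np.pi * dow / 7)
    out["month_sin"], out["month_cos"] = np.sin(2 * np.pi * month / 12), np.cos(2 * np.pi * month / 12)
    out["is_weekend"] = (dow >= 5).astype(int)
    return out

# Hypothetical usage
events = pd.DataFrame({"event_date": pd.date_range("2024-01-01", periods=10, freq="D")})
print(add_cyclical_features(events, "event_date").head(3))
```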
@@ -3119,8 +3114,8 @@
 "# These flags tell 01d which optional features to include based on analysis results\n",
 "pattern_summary[\"feature_flags\"] = {\n",
 " \"include_recency\": (\n",
-" recency_comparison.cohens_d > 0.2
-" if 'recency_comparison' in dir() and recency_comparison
+" recency_comparison.cohens_d > 0.2\n",
+" if 'recency_comparison' in dir() and recency_comparison\n",
 " else True\n",
 " ),\n",
 " \"include_tenure\": True, # Default on; could be derived from tenure analysis if available\n",
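`include_recency` is switched on when the Cohen's d of recency between retained and churned entities clears 0.2. For context, a small sketch of the recency feature itself, days since each entity's last event relative to a reference date; column names are hypothetical and this is not the notebook's `recency_result`:

```python
import pandas as pd

def recency_days(df: pd.DataFrame, entity_col: str, time_col: str, as_of=None) -> pd.Series:
    """Days between each entity's most recent event and the reference date."""
    as_of = as_of or df[time_col].max()
    last_seen = df.groupby(entity_col)[time_col].max()
    return (as_of - last_seen).dt.days.rename("recency_days")

# Hypothetical usage
events = pd.DataFrame({
    "customer_id": [1, 1, 2, 2, 3],
    "event_date": pd.to_datetime(["2024-01-02", "2024-01-20", "2024-01-05", "2024-01-06", "2024-01-19"]),
})
print(recency_days(events, "customer_id", "event_date"))
```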
@@ -3161,7 +3156,7 @@
 " if skip_recs:\n",
 " print(f\"\\n👥 COHORT: Skip cohort features ({skip_recs[0]['reason']})\")\n",
 " elif feature_recs:\n",
-" print(
+" print(\"\\n👥 COHORT FEATURES TO ADD:\")\n",
 " for rec in feature_recs:\n",
 " print(f\" • {', '.join(rec['features'])} ({rec['priority']} priority)\")\n",
 "\n",