churnkit 0.76.0a3__py3-none-any.whl → 0.76.1a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +10 -5
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +11 -9
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +52 -46
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +68 -65
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +12 -27
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +216 -221
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +88 -81
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +111 -108
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +44 -38
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +89 -85
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +81 -80
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +83 -89
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +102 -98
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +32 -31
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +33 -29
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +6 -5
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +67 -63
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +38 -23
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +3 -1
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/METADATA +1 -1
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/RECORD +31 -31
- customer_retention/__init__.py +1 -1
- customer_retention/analysis/auto_explorer/explorer.py +2 -2
- customer_retention/analysis/notebook_progress.py +14 -2
- customer_retention/core/compat/__init__.py +10 -0
- customer_retention/core/config/experiments.py +45 -0
- customer_retention/integrations/databricks_init.py +41 -1
- customer_retention/stages/profiling/column_profiler.py +9 -2
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/WHEEL +0 -0
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/entry_points.txt +0 -0
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/licenses/LICENSE +0 -0
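The recurring change across the exploration notebooks is setup-cell consolidation: standard-library and plotting imports are grouped at the top of the cell, per-notebook path handling (`from pathlib import Path`) is dropped, and a shared `FINDINGS_DIR` is imported from the new `customer_retention.core.config.experiments` module (+45 lines) in place of the old `FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure` import. A minimal sketch of the resulting setup-cell shape, assembled only from import and loader lines visible in the hunks below (the exact cell composition varies per notebook and is an assumption here):

    # Sketch of the consolidated setup cell in 0.76.1a2, composed from lines
    # shown in the diff hunks below; not copied verbatim from any one cell.
    import numpy as np
    import pandas as pd
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    from customer_retention.analysis.auto_explorer import ExplorationFindings
    from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table
    from customer_retention.core.config.experiments import FINDINGS_DIR
    from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference

    # `findings` is an ExplorationFindings object loaded earlier in each notebook;
    # the loader call itself appears unchanged in this release.
    # df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))
    # charts = ChartBuilder()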
@@ -96,22 +96,26 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"01a_temporal_deep_dive.ipynb\")\n",
 "\n",
+"import numpy as np\n",
+"import pandas as pd\n",
+"import plotly.graph_objects as go\n",
+"from plotly.subplots import make_subplots\n",
+"\n",
 "from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
 "from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
 "from customer_retention.core.config.column_config import ColumnType, DatasetGranularity\n",
+"from customer_retention.core.config.experiments import (\n",
+" FINDINGS_DIR,\n",
+")\n",
 "from customer_retention.stages.profiling import (\n",
-"
+" DistributionAnalyzer,\n",
+" TimeSeriesProfiler,\n",
+" TransformationType,\n",
 " TypeDetector,\n",
-"
-")\n",
-"import pandas as pd\n",
-"import numpy as np\n",
-"import plotly.graph_objects as go\n",
-"import plotly.express as px\n",
-"from plotly.subplots import make_subplots\n",
-"from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure\n"
+")\n"
 ]
 },
 {
@@ -141,7 +145,6 @@
 "# FINDINGS_PATH = \"../experiments/findings/transactions_abc123_findings.yaml\"\n",
 "\n",
 "# Option 2: Auto-discover findings files\n",
-"from pathlib import Path\n",
 "\n",
 "# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
 "\n",
@@ -238,7 +241,7 @@
 },
 "outputs": [],
 "source": [
-"from customer_retention.stages.temporal import
+"from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n",
 "\n",
 "df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
 "charts = ChartBuilder()\n",
@@ -280,7 +283,7 @@
 " # Manual configuration - uncomment and set if auto-detection failed\n",
 " # ENTITY_COLUMN = \"customer_id\"\n",
 " # TIME_COLUMN = \"event_date\"\n",
-"
+"\n",
 " # Try auto-detection\n",
 " detector = TypeDetector()\n",
 " granularity = detector.detect_granularity(df)\n",
@@ -346,17 +349,17 @@
 "print(\"=\"*70)\n",
 "print(\"TIME SERIES PROFILE SUMMARY\")\n",
 "print(\"=\"*70)\n",
-"print(
+"print(\"\\n\\U0001f4ca Dataset Overview:\")\n",
 "print(f\" Total Events: {ts_profile.total_events:,}\")\n",
 "print(f\" Unique Entities: {ts_profile.unique_entities:,}\")\n",
 "print(f\" Avg Events/Entity: {ts_profile.events_per_entity.mean:.1f}\")\n",
 "print(f\" Time Span: {ts_profile.time_span_days:,} days ({ts_profile.time_span_days/365:.1f} years)\")\n",
 "\n",
-"print(
+"print(\"\\n\\U0001f4c5 Date Range:\")\n",
 "print(f\" First Event: {ts_profile.first_event_date}\")\n",
 "print(f\" Last Event: {ts_profile.last_event_date}\")\n",
 "\n",
-"print(
+"print(\"\\n\\u23f1\\ufe0f Inter-Event Timing:\")\n",
 "if ts_profile.avg_inter_event_days is not None:\n",
 " print(f\" Avg Days Between Events: {ts_profile.avg_inter_event_days:.1f}\")\n",
 "else:\n",
@@ -791,7 +794,7 @@
 },
 "outputs": [],
 "source": [
-"print(
+"print(\"Coverage Summary:\")\n",
 "print(f\" Time span: {coverage_result.time_span_days:,} days \"\n",
 " f\"({coverage_result.first_event.strftime('%Y-%m-%d')} to {coverage_result.last_event.strftime('%Y-%m-%d')})\")\n",
 "print(f\" Volume trend: {coverage_result.volume_trend} ({coverage_result.volume_change_pct:+.0%})\")\n",
@@ -799,11 +802,11 @@
 " + (f\" ({sum(g.duration_days for g in coverage_result.gaps):.0f} total days)\" if coverage_result.gaps else \"\"))\n",
 "\n",
 "if coverage_result.recommendations:\n",
-" print(
+" print(\"\\nRecommendations:\")\n",
 " for rec in coverage_result.recommendations:\n",
 " print(f\" -> {rec}\")\n",
 "else:\n",
-" print(
+" print(\"\\nNo coverage issues detected — data is suitable for all candidate windows.\")"
 ]
 },
 {
@@ -843,7 +846,7 @@
 "if drift.recommended_training_start:\n",
 " print(f\" Recommended training start: {drift.recommended_training_start.strftime('%Y-%m-%d')}\")\n",
 "\n",
-"print(
+"print(\"\\nRationale:\")\n",
 "for r in drift.rationale:\n",
 " print(f\" -> {r}\")"
 ]
@@ -904,7 +907,7 @@
 "\n",
 "if inter_event_times:\n",
 " inter_event_series = pd.Series(inter_event_times)\n",
-"
+"\n",
 " print(\"\\u23f1\\ufe0f Inter-Event Time Distribution (days):\")\n",
 " print(f\" Min: {inter_event_series.min():.2f}\")\n",
 " print(f\" 25th percentile: {inter_event_series.quantile(0.25):.2f}\")\n",
@@ -912,14 +915,14 @@
 " print(f\" Mean: {inter_event_series.mean():.2f}\")\n",
 " print(f\" 75th percentile: {inter_event_series.quantile(0.75):.2f}\")\n",
 " print(f\" Max: {inter_event_series.max():.2f}\")\n",
-"
+"\n",
 " # Histogram\n",
 " fig = go.Figure()\n",
-"
+"\n",
 " # Cap at 99th percentile for visualization\n",
 " cap = inter_event_series.quantile(0.99)\n",
 " display_data = inter_event_series[inter_event_series <= cap]\n",
-"
+"\n",
 " fig.add_trace(go.Histogram(\n",
 " x=display_data,\n",
 " nbinsx=50,\n",
@@ -927,11 +930,11 @@
 " marker_color=\"coral\",\n",
 " opacity=0.7\n",
 " ))\n",
-"
+"\n",
 " fig.add_vline(x=inter_event_series.median(), line_dash=\"solid\", line_color=\"green\",\n",
 " annotation_text=f\"Median: {inter_event_series.median():.1f} days\",\n",
 " annotation_position=\"top right\")\n",
-"
+"\n",
 " fig.update_layout(\n",
 " title=f\"Inter-Event Time Distribution (capped at {cap:.0f} days = 99th percentile)\",\n",
 " xaxis_title=\"Days Between Events\",\n",
@@ -978,7 +981,7 @@
 " if skew_ratio > 1.5:\n",
 " print(f\" Distribution is heavily right-skewed (mean/median = {skew_ratio:.2f})\")\n",
 " print(f\" -> Most entities engage frequently (median {median_iet:.0f}d between events)\")\n",
-" print(
+" print(\" -> A long tail of entities has very infrequent engagement\")\n",
 " elif skew_ratio > 1.2:\n",
 " print(f\" Distribution is moderately right-skewed (mean/median = {skew_ratio:.2f})\")\n",
 " print(f\" -> Typical engagement every {median_iet:.0f} days, with some long gaps\")\n",
@@ -988,11 +991,11 @@
 "\n",
 " print(f\"\\n Spread: IQR = {iqr:.0f} days (Q25={q25:.0f}d to Q75={q75:.0f}d)\")\n",
 " if iqr > median_iet:\n",
-" print(
+" print(\" -> High variability (IQR > median) — entities have inconsistent timing\")\n",
 " else:\n",
-" print(
+" print(\" -> Moderate variability — most entities follow a similar cadence\")\n",
 "\n",
-" print(
+" print(\"\\nRecommendations:\")\n",
 " # Window alignment\n",
 " window_map = [(1, \"24h\"), (7, \"7d\"), (14, \"14d\"), (30, \"30d\"),\n",
 " (90, \"90d\"), (180, \"180d\"), (365, \"365d\")]\n",
@@ -1000,7 +1003,7 @@
 " if aligned:\n",
 " aligned_str = \", \".join(w for _, w in aligned)\n",
 " print(f\" -> Windows aligned with median inter-event time: {aligned_str}\")\n",
-" print(
+" print(\" These capture ~2 events per entity on average\")\n",
 " else:\n",
 " print(f\" -> Median inter-event ({median_iet:.0f}d) does not align with standard windows\")\n",
 "\n",
@@ -1010,11 +1013,11 @@
 " print(f\" -> 30d window captures only ~{events_in_30d:.1f} events/entity — \"\n",
 " f\"consider longer windows (90d+) for meaningful aggregations\")\n",
 " if median_iet < 7:\n",
-" print(
+" print(\" -> High frequency engagement — 7d and 24h windows will be rich with signal\")\n",
 "\n",
 " if skew_ratio > 1.5:\n",
-" print(
-"
+" print(\" -> Consider log-transforming inter-event time as a feature \"\n",
+" \"(reduces right-skew impact on models)\")\n"
 ]
 },
 {
@@ -1061,13 +1064,13 @@
 "# Use framework's DistributionAnalyzer for comprehensive analysis\n",
 "analyzer = DistributionAnalyzer()\n",
 "\n",
-"numeric_cols = [n for n, c in findings.columns.items()
+"numeric_cols = [n for n, c in findings.columns.items()\n",
 " if c.inferred_type.value in ('numeric_continuous', 'numeric_discrete')\n",
 " and n not in [ENTITY_COLUMN, TIME_COLUMN] and n not in TEMPORAL_METADATA_COLS]\n",
 "\n",
 "# Analyze all numeric columns using the framework\n",
 "analyses = analyzer.analyze_dataframe(df, numeric_cols)\n",
-"recommendations = {col: analyzer.recommend_transformation(analysis)
+"recommendations = {col: analyzer.recommend_transformation(analysis)\n",
 " for col, analysis in analyses.items()}\n",
 "\n",
 "# Human-readable transformation names\n",
@@ -1092,25 +1095,25 @@
 " col_info = findings.columns[col_name]\n",
 " analysis = analyses.get(col_name)\n",
 " rec = recommendations.get(col_name)\n",
-"
+"\n",
 " print(f\"\\n{'='*70}\")\n",
 " print(f\"Column: {col_name}\")\n",
 " print(f\"Type: {col_info.inferred_type.value} (Confidence: {col_info.confidence:.0%})\")\n",
-" print(
-"
+" print(\"-\" * 70)\n",
+"\n",
 " if analysis:\n",
-" print(
+" print(\"📊 Distribution Statistics:\")\n",
 " print(f\" Mean: {analysis.mean:.3f} | Median: {analysis.median:.3f} | Std: {analysis.std:.3f}\")\n",
 " print(f\" Range: [{analysis.min_value:.3f}, {analysis.max_value:.3f}]\")\n",
 " print(f\" Percentiles: 1%={analysis.percentiles['p1']:.3f}, 25%={analysis.q1:.3f}, 75%={analysis.q3:.3f}, 99%={analysis.percentiles['p99']:.3f}\")\n",
-" print(
+" print(\"\\n📈 Shape Analysis:\")\n",
 " skew_label = '(Right-skewed)' if analysis.skewness > 0.5 else '(Left-skewed)' if analysis.skewness < -0.5 else '(Symmetric)'\n",
 " print(f\" Skewness: {analysis.skewness:.2f} {skew_label}\")\n",
 " kurt_label = '(Heavy tails/outliers)' if analysis.kurtosis > 3 else '(Light tails)'\n",
 " print(f\" Kurtosis: {analysis.kurtosis:.2f} {kurt_label}\")\n",
 " print(f\" Zeros: {analysis.zero_count:,} ({analysis.zero_percentage:.1f}%)\")\n",
 " print(f\" Outliers (IQR): {analysis.outlier_count_iqr:,} ({analysis.outlier_percentage:.1f}%)\")\n",
-"
+"\n",
 " if rec:\n",
 " transform_display = TRANSFORM_DISPLAY_NAMES.get(rec.recommended_transform.value, rec.recommended_transform.value)\n",
 " print(f\"\\n🔧 Recommended Transformation: {transform_display}\")\n",
@@ -1149,20 +1152,20 @@
 " rec = recommendations.get(col_name)\n",
 " if not analysis:\n",
 " continue\n",
-"
+"\n",
 " data = df[col_name].dropna()\n",
 " fig = go.Figure()\n",
-"
+"\n",
 " fig.add_trace(go.Histogram(x=data, nbinsx=50, name='Distribution',\n",
 " marker_color='steelblue', opacity=0.7))\n",
-"
+"\n",
 " mean_val = data.mean()\n",
 " median_val = data.median()\n",
-"
+"\n",
 " # Position labels on opposite sides to avoid overlap\n",
 " mean_position = \"top right\" if mean_val >= median_val else \"top left\"\n",
 " median_position = \"top left\" if mean_val >= median_val else \"top right\"\n",
-"
+"\n",
 " fig.add_vline(\n",
 " x=mean_val, line_dash=\"dash\", line_color=\"red\",\n",
 " annotation_text=f\"Mean: {mean_val:.2f}\",\n",
@@ -1170,7 +1173,7 @@
 " annotation_font_color=\"red\",\n",
 " annotation_bgcolor=\"rgba(255,255,255,0.8)\"\n",
 " )\n",
-"
+"\n",
 " fig.add_vline(\n",
 " x=median_val, line_dash=\"solid\", line_color=\"green\",\n",
 " annotation_text=f\"Median: {median_val:.2f}\",\n",
@@ -1178,7 +1181,7 @@
 " annotation_font_color=\"green\",\n",
 " annotation_bgcolor=\"rgba(255,255,255,0.8)\"\n",
 " )\n",
-"
+"\n",
 " # Add 99th percentile marker if there are outliers\n",
 " if analysis.outlier_percentage > 5:\n",
 " fig.add_vline(x=analysis.percentiles['p99'], line_dash=\"dot\", line_color=\"orange\",\n",
@@ -1186,7 +1189,7 @@
 " annotation_position=\"top right\",\n",
 " annotation_font_color=\"orange\",\n",
 " annotation_bgcolor=\"rgba(255,255,255,0.8)\")\n",
-"
+"\n",
 " transform_key = rec.recommended_transform.value if rec else \"none\"\n",
 " transform_label = TRANSFORM_DISPLAY_NAMES.get(transform_key, transform_key)\n",
 " fig.update_layout(\n",
@@ -1233,12 +1236,12 @@
 "for col_name in categorical_cols:\n",
 " col_info = findings.columns[col_name]\n",
 " cardinality = col_info.universal_metrics.get('distinct_count', df[col_name].nunique())\n",
-"
+"\n",
 " print(f\"\\n{'='*50}\")\n",
 " print(f\"Column: {col_name}\")\n",
 " print(f\"Type: {col_info.inferred_type.value} (Confidence: {col_info.confidence:.0%})\")\n",
 " print(f\"Distinct Values: {cardinality}\")\n",
-"
+"\n",
 " # Encoding recommendation based on type and cardinality\n",
 " if col_info.inferred_type.value == 'categorical_cyclical':\n",
 " encoding_rec = \"Sin/Cos encoding (cyclical)\"\n",
@@ -1249,7 +1252,7 @@
 " else:\n",
 " encoding_rec = \"Target encoding or Frequency encoding (high cardinality)\"\n",
 " print(f\"Recommended Encoding: {encoding_rec}\")\n",
-"
+"\n",
 " # Value counts visualization\n",
 " value_counts = df[col_name].value_counts().head(10)\n",
 " fig = charts.bar_chart(value_counts.index.tolist(), value_counts.values.tolist(),\n",
@@ -1314,7 +1317,7 @@
 " # Sort by priority\n",
 " priority_order = {'high': 0, 'medium': 1, 'low': 2}\n",
 " transformations.sort(key=lambda x: priority_order.get(x['priority'], 3))\n",
-"
+"\n",
 " for t in transformations:\n",
 " priority_marker = \"🔴\" if t['priority'] == 'high' else \"🟡\" if t['priority'] == 'medium' else \"🟢\"\n",
 " print(f\"\\n {priority_marker} {t['column']}: {t['transform']}\")\n",
@@ -1352,7 +1355,7 @@
 " print(\"TEMPORAL AGGREGATION PERSPECTIVE\")\n",
 " print(\"=\"*70)\n",
 " print(f\"\\nMedian inter-event time: {median_iet:.0f} days\")\n",
-" print(
+" print(\"Expected events per window (at median cadence):\")\n",
 " windows_days = [(\"7d\", 7), (\"30d\", 30), (\"90d\", 90), (\"180d\", 180), (\"365d\", 365)]\n",
 " for label, days in windows_days:\n",
 " expected = days / median_iet if median_iet > 0 else 0\n",
@@ -1360,7 +1363,7 @@
 " print(f\" {marker} {label}: ~{expected:.1f} events/entity\")\n",
 "\n",
 " # Within-entity vs between-entity variance per column\n",
-" print(
+" print(\"\\nColumn Temporal Variability (within-entity CV vs between-entity CV):\")\n",
 " print(f\"{'Column':<25} {'Within-CV':<12} {'Between-CV':<12} {'Ratio':<8} {'Aggregation Guidance'}\")\n",
 " print(\"-\" * 90)\n",
 "\n",
@@ -1390,11 +1393,11 @@
 " ratio_str = f\"{ratio:.2f}\" if not np.isinf(ratio) else \">10\"\n",
 " print(f\"{col:<25} {within_str:<12} {between_cv:<12.2f} {ratio_str:<8} {guidance}\")\n",
 "\n",
-" print(
-" print(
-" print(
-" print(
-" print(
+" print(\"\\nInterpretation:\")\n",
+" print(\" Within-CV: how much each entity\\'s values vary across their events\")\n",
+" print(\" Between-CV: how much entity averages differ from each other\")\n",
+" print(\" Ratio > 1: temporal variation dominates -> shorter windows capture dynamics\")\n",
+" print(\" Ratio < 1: entity identity dominates -> longer windows (or all_time) sufficient\")\n"
 ]
 },
 {
@@ -1466,8 +1469,8 @@
 "explanation[\"meaningful_pct\"] = (explanation[\"meaningful_pct\"] * 100).round(1).astype(str) + \"%\"\n",
 "display_table(explanation)\n",
 "\n",
-"print(
-"print(
+"print(\"\\nCoverage: % of entities with enough tenure AND expected >=2 events in that window\")\n",
+"print(\"Meaningful: among entities with enough tenure, % that have sufficient event density\")"
 ]
 },
 {
@@ -1495,9 +1498,9 @@
 "h = window_result.heterogeneity\n",
 "\n",
 "print(\"Temporal Heterogeneity (eta-squared):\")\n",
-"print(
-"print(
-"print(
+"print(\" eta² measures the fraction of variance in a metric explained by lifecycle quadrant grouping.\")\n",
+"print(\" Scale: 0 = no group differences, 1 = all variance is between groups.\")\n",
+"print(\" Thresholds: <0.06 = low | 0.06-0.14 = moderate | >0.14 = high effect size\\n\")\n",
 "\n",
 "eta_max = max(h.eta_squared_intensity, h.eta_squared_event_count)\n",
 "print(f\" Intensity eta²: {h.eta_squared_intensity:.3f} {'<-- dominant' if h.eta_squared_intensity >= h.eta_squared_event_count else ''}\")\n",
@@ -71,22 +71,25 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"01b_temporal_quality.ipynb\")\n",
 "\n",
-"from pathlib import Path\n",
 "import pandas as pd\n",
 "import plotly.graph_objects as go\n",
-"from plotly.subplots import make_subplots\n",
 "\n",
 "from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationEngine\n",
 "from customer_retention.analysis.visualization import ChartBuilder, display_figure\n",
 "from customer_retention.core.config.column_config import ColumnType\n",
+"from customer_retention.core.config.experiments import FINDINGS_DIR\n",
 "from customer_retention.stages.profiling import (\n",
-" DuplicateEventCheck
-"
+" DuplicateEventCheck,\n",
+" EventOrderCheck,\n",
+" FutureDateCheck,\n",
+" SegmentAwareOutlierAnalyzer,\n",
+" TemporalGapCheck,\n",
+" TemporalQualityReporter,\n",
 ")\n",
-"from customer_retention.stages.temporal import
-"from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure\n"
+"from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n"
 ]
 },
 {
@@ -397,7 +400,7 @@
 "if numeric_cols:\n",
 " analyzer = SegmentAwareOutlierAnalyzer(max_segments=5)\n",
 " result = analyzer.analyze(df, feature_cols=numeric_cols, segment_col=None, target_col=findings.target_column)\n",
-"
+"\n",
 " print(f\"Segments detected: {result.n_segments}\")\n",
 " if result.n_segments > 1:\n",
 " data = [{\"Feature\": c, \"Global\": result.global_analysis[c].outliers_detected,\n",
@@ -456,25 +459,7 @@
 "tags": []
 },
 "outputs": [],
-"source": [
-"# Binary field validation\n",
-"binary_cols = [n for n, c in findings.columns.items() if c.inferred_type == ColumnType.BINARY and n not in TEMPORAL_METADATA_COLS]\n",
-"for col in binary_cols:\n",
-" c0, c1 = (df[col] == 0).sum(), (df[col] == 1).sum()\n",
-" print(f\"✓ {col}: 0={c0:,} ({c0/(c0+c1)*100:.1f}%), 1={c1:,} ({c1/(c0+c1)*100:.1f}%)\")\n",
-"\n",
-"# Consistency check\n",
-"issues = []\n",
-"for col in df.select_dtypes(include=['object']).columns:\n",
-" if col in [ENTITY_COLUMN, TIME_COLUMN]: continue\n",
-" variants = {}\n",
-" for v in df[col].dropna().unique():\n",
-" key = str(v).lower().strip()\n",
-" variants.setdefault(key, []).append(v)\n",
-" issues.extend([{\"Column\": col, \"Variants\": vs} for vs in variants.values() if len(vs) > 1])\n",
-"\n",
-"print(f\"\\n{'⚠️ Consistency issues: ' + str(len(issues)) if issues else '✅ No consistency issues'}\")"
-]
+"source": "# Binary field validation\nbinary_cols = [n for n, c in findings.columns.items() if c.inferred_type == ColumnType.BINARY and n not in TEMPORAL_METADATA_COLS]\nfor col in binary_cols:\n c0, c1 = (df[col] == 0).sum(), (df[col] == 1).sum()\n print(f\"✓ {col}: 0={c0:,} ({c0/(c0+c1)*100:.1f}%), 1={c1:,} ({c1/(c0+c1)*100:.1f}%)\")\n\n# Consistency check\nissues = []\nfor col in df.select_dtypes(include=['object']).columns:\n if col in [ENTITY_COLUMN, TIME_COLUMN]:\n continue\n variants = {}\n for v in df[col].dropna().unique():\n key = str(v).lower().strip()\n variants.setdefault(key, []).append(v)\n issues.extend([{\"Column\": col, \"Variants\": vs} for vs in variants.values() if len(vs) > 1])\n\nprint(f\"\\n{'⚠️ Consistency issues: ' + str(len(issues)) if issues else '✅ No consistency issues'}\")"
 },
 {
 "cell_type": "markdown",
@@ -676,4 +661,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 5
-}
+}