churnkit 0.76.0a3__py3-none-any.whl → 0.76.1a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +10 -5
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +11 -9
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +52 -46
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +68 -65
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +12 -27
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +216 -221
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +88 -81
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +111 -108
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +44 -38
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +89 -85
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +81 -80
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +83 -89
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +102 -98
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +32 -31
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +33 -29
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +6 -5
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +67 -63
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +38 -23
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +3 -1
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/METADATA +1 -1
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/RECORD +31 -31
- customer_retention/__init__.py +1 -1
- customer_retention/analysis/auto_explorer/explorer.py +2 -2
- customer_retention/analysis/notebook_progress.py +14 -2
- customer_retention/core/compat/__init__.py +10 -0
- customer_retention/core/config/experiments.py +45 -0
- customer_retention/integrations/databricks_init.py +41 -1
- customer_retention/stages/profiling/column_profiler.py +9 -2
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/WHEEL +0 -0
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/entry_points.txt +0 -0
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/licenses/LICENSE +0 -0
{churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb

@@ -95,36 +95,30 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"05_multi_dataset.ipynb\")\n",
 "\n",
+"import pandas as pd\n",
+"import plotly.graph_objects as go\n",
+"import yaml\n",
+"from plotly.subplots import make_subplots\n",
+"\n",
 "from customer_retention.analysis.auto_explorer import (\n",
 " ExplorationManager,\n",
-" MultiDatasetFindings,\n",
-" ExplorationFindings,\n",
 " RecommendationRegistry,\n",
 ")\n",
+"from customer_retention.analysis.visualization import display_figure\n",
+"from customer_retention.core.config.column_config import ColumnType, DatasetGranularity\n",
+"from customer_retention.core.config.experiments import FINDINGS_DIR\n",
 "from customer_retention.stages.profiling import (\n",
+" DimensionReductionMethod,\n",
+" FeatureCapacityAnalyzer,\n",
+" FeatureGroup,\n",
+" ReferenceMode,\n",
 " RelationshipDetector,\n",
-" TimeWindowAggregator,\n",
-" RelationshipType,\n",
 " SegmentAnalyzer,\n",
-" SegmentationMethod,\n",
-" FeatureCapacityAnalyzer,\n",
-" TemporalFeatureEngineer,\n",
 " TemporalAggregationConfig,\n",
-"
-" FeatureGroup,\n",
-" DimensionReductionMethod,\n",
-")\n",
-"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
-"from customer_retention.core.config.column_config import DatasetGranularity, ColumnType\n",
-"from pathlib import Path\n",
-"import yaml\n",
-"import pandas as pd\n",
-"import numpy as np\n",
-"import plotly.graph_objects as go\n",
-"from plotly.subplots import make_subplots\n",
-"from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure\n"
+")\n"
 ]
 },
 {
@@ -211,7 +205,7 @@
 "for ds in datasets:\n",
 " granularity_emoji = \"\\U0001f4ca\" if ds.granularity == DatasetGranularity.ENTITY_LEVEL else \"\\U0001f4c8\"\n",
 " target_info = f\" [TARGET: {ds.target_column}]\" if ds.target_column else \"\"\n",
-"
+"\n",
 " print(f\"{granularity_emoji} {ds.name}\")\n",
 " print(f\" Granularity: {ds.granularity.value}\")\n",
 " print(f\" Rows: {ds.row_count:,} | Columns: {ds.column_count}\")\n",
@@ -268,7 +262,7 @@
 " names = [ds.name for ds in datasets]\n",
 " rows = [ds.row_count for ds in datasets]\n",
 " cols = [ds.column_count for ds in datasets]\n",
-" granularities = [\"Entity\" if ds.granularity == DatasetGranularity.ENTITY_LEVEL else \"Event\"
+" granularities = [\"Entity\" if ds.granularity == DatasetGranularity.ENTITY_LEVEL else \"Event\"\n",
 " for ds in datasets]\n",
 " colors = [\"#2ecc71\" if ds.granularity == DatasetGranularity.ENTITY_LEVEL else \"#3498db\"\n",
 " for ds in datasets]\n",
@@ -323,7 +317,7 @@
 " text=\"<b>Primary Entity</b>\", showarrow=False, font=dict(size=11, color=\"#666\"), xanchor=\"left\"))\n",
 " y_pos -= 0.06\n",
 " annotations.append(dict(x=0.01, y=y_pos, xref=\"paper\", yref=\"paper\",\n",
-" text=f\"<span style='color:{primary_color}'>{primary_name}</span>\"
+" text=f\"<span style='color:{primary_color}'>{primary_name}</span>\",\n",
 " showarrow=False, font=dict(size=12), xanchor=\"left\"))\n",
 " y_pos -= 0.10\n",
 "\n",
@@ -331,7 +325,7 @@
 " annotations.append(dict(x=0.01, y=y_pos, xref=\"paper\", yref=\"paper\",\n",
 " text=\"<b>Event Datasets</b>\", showarrow=False, font=dict(size=11, color=\"#666\"), xanchor=\"left\"))\n",
 " y_pos -= 0.06\n",
-"
+"\n",
 " if multi.event_datasets:\n",
 " # Show each event dataset on its own line (supports 20+ datasets)\n",
 " max_display = min(len(multi.event_datasets), 8) # Show up to 8, then summarize\n",
@@ -339,7 +333,7 @@
 " annotations.append(dict(x=0.03, y=y_pos, xref=\"paper\", yref=\"paper\",\n",
 " text=f\"• {event_name}\", showarrow=False, font=dict(size=10, color=\"#3498db\"), xanchor=\"left\"))\n",
 " y_pos -= 0.045\n",
-"
+"\n",
 " if len(multi.event_datasets) > max_display:\n",
 " remaining = len(multi.event_datasets) - max_display\n",
 " annotations.append(dict(x=0.03, y=y_pos, xref=\"paper\", yref=\"paper\",\n",
@@ -369,7 +363,7 @@
 " # Hide axes on left panel\n",
 " fig.update_xaxes(visible=False, row=1, col=1)\n",
 " fig.update_yaxes(visible=False, row=1, col=1)\n",
-"
+"\n",
 " # Configure horizontal bar axes\n",
 " fig.update_yaxes(categoryorder='total ascending', row=1, col=2)\n",
 " fig.update_yaxes(categoryorder='total ascending', row=2, col=2)\n",
@@ -502,18 +496,18 @@
 "# If we have a primary entity dataset and event datasets, try to detect relationships\n",
 "if multi.primary_entity_dataset and multi.event_datasets:\n",
 " primary_info = multi.datasets[multi.primary_entity_dataset]\n",
-"
+"\n",
 " print(f\"Primary dataset: {multi.primary_entity_dataset}\")\n",
-" print(
-"
+" print(\"Checking relationships with event datasets...\\n\")\n",
+"\n",
 " for event_name in multi.event_datasets:\n",
 " event_info = multi.datasets[event_name]\n",
-"
+"\n",
 " # Check if they share common column names\n",
 " if event_info.entity_column:\n",
 " print(f\"\\U0001f517 {multi.primary_entity_dataset} <-> {event_name}\")\n",
 " print(f\" Potential join column: {event_info.entity_column}\")\n",
-" print(
+" print(\" Expected relationship: one_to_many\")\n",
 " print()\n",
 "else:\n",
 " print(\"Not enough datasets to detect relationships.\")\n",
@@ -667,7 +661,7 @@
 "print(\"=\"*70 + \"\\n\")\n",
 "\n",
 "for group in FeatureGroup:\n",
-" enabled = \"✓\" if group in [FeatureGroup.LAGGED_WINDOWS, FeatureGroup.VELOCITY
+" enabled = \"✓\" if group in [FeatureGroup.LAGGED_WINDOWS, FeatureGroup.VELOCITY,\n",
 " FeatureGroup.RECENCY, FeatureGroup.REGULARITY] else \"○\"\n",
 " print(f\" {enabled} {group.value}\")"
 ]
@@ -748,14 +742,14 @@
 "for dataset_name in multi.event_datasets:\n",
 " ds_info = multi.datasets[dataset_name]\n",
 " findings = manager.load_findings(dataset_name)\n",
-"
+"\n",
 " if findings:\n",
 " numeric_cols = [\n",
 " name for name, col in findings.columns.items()\n",
 " if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
 " and name not in [ds_info.entity_column, ds_info.time_column] and name not in TEMPORAL_METADATA_COLS\n",
 " ]\n",
-"
+"\n",
 " if numeric_cols:\n",
 " registry.add_silver_temporal_config(\n",
 " source_dataset=dataset_name,\n",
@@ -830,13 +824,13 @@
 "\n",
 "for dataset_name in multi.event_datasets:\n",
 " ds_info = multi.datasets[dataset_name]\n",
-"
+"\n",
 " print(f\"\\U0001f4c8 From {dataset_name}:\")\n",
 " print()\n",
-"
+"\n",
 " # Load findings to see numeric columns\n",
 " findings = manager.load_findings(dataset_name)\n",
-"
+"\n",
 " # Find numeric columns that could be aggregated\n",
 " numeric_cols = []\n",
 " if findings:\n",
@@ -845,7 +839,7 @@
 " if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
 " and name not in [ds_info.entity_column, ds_info.time_column] and name not in TEMPORAL_METADATA_COLS\n",
 " ]\n",
-"
+"\n",
 " # Group 1: Lagged Window Features\n",
 " if FeatureGroup.LAGGED_WINDOWS in FEATURE_GROUPS:\n",
 " print(\" 📊 LAGGED WINDOWS (Group 1):\")\n",
@@ -853,21 +847,21 @@
 " features = [f\"lag{i}_{col}_{agg}\" for i in range(NUM_LAGS) for agg in LAG_AGGREGATIONS[:2]]\n",
 " print(f\" {col}: {features[:4]}...\")\n",
 " print(f\" Total: {len(numeric_cols)} cols × {NUM_LAGS} lags × {len(LAG_AGGREGATIONS)} aggs\")\n",
-"
+"\n",
 " # Group 2: Velocity Features\n",
 " if FeatureGroup.VELOCITY in FEATURE_GROUPS:\n",
 " print(\"\\n 🚀 VELOCITY (Group 2):\")\n",
 " for col in numeric_cols[:2]:\n",
 " print(f\" - {col}_velocity, {col}_velocity_pct\")\n",
 " print(f\" Total: {len(numeric_cols)} cols × 2 features\")\n",
-"
+"\n",
 " # Group 3: Acceleration Features\n",
 " if FeatureGroup.ACCELERATION in FEATURE_GROUPS:\n",
 " print(\"\\n ⚡ ACCELERATION (Group 3):\")\n",
 " for col in numeric_cols[:2]:\n",
 " print(f\" - {col}_acceleration, {col}_momentum\")\n",
 " print(f\" Total: {len(numeric_cols)} cols × 2 features\")\n",
-"
+"\n",
 " # Group 4: Lifecycle Features\n",
 " if FeatureGroup.LIFECYCLE in FEATURE_GROUPS:\n",
 " print(\"\\n 📈 LIFECYCLE (Group 4):\")\n",
@@ -875,7 +869,7 @@
 " print(f\" - {col}_beginning, {col}_middle, {col}_end, {col}_trend_ratio\")\n",
 " print(f\" Total: {len(numeric_cols)} cols × 4 features\")\n",
 " print(f\" ℹ️ Requires {MIN_HISTORY_DAYS}+ days of history (else NaN)\")\n",
-"
+"\n",
 " # Group 5: Recency Features\n",
 " if FeatureGroup.RECENCY in FEATURE_GROUPS:\n",
 " print(\"\\n ⏱️ RECENCY (Group 5):\")\n",
@@ -883,7 +877,7 @@
 " print(\" - days_since_first_event\")\n",
 " print(\" - active_span_days\")\n",
 " print(\" - recency_ratio\")\n",
-"
+"\n",
 " # Group 6: Regularity Features\n",
 " if FeatureGroup.REGULARITY in FEATURE_GROUPS:\n",
 " print(\"\\n 🎯 REGULARITY (Group 6):\")\n",
@@ -891,14 +885,14 @@
 " print(\" - inter_event_gap_mean\")\n",
 " print(\" - inter_event_gap_std\")\n",
 " print(\" - regularity_score\")\n",
-"
+"\n",
 " # Group 7: Cohort Comparison\n",
 " if FeatureGroup.COHORT_COMPARISON in FEATURE_GROUPS:\n",
 " print(\"\\n 👥 COHORT COMPARISON (Group 7):\")\n",
 " for col in numeric_cols[:2]:\n",
 " print(f\" - {col}_vs_cohort_mean, {col}_vs_cohort_pct, {col}_cohort_zscore\")\n",
 " print(f\" Total: {len(numeric_cols)} cols × 3 features\")\n",
-"
+"\n",
 " # Summary\n",
 " total_features = 0\n",
 " if FeatureGroup.LAGGED_WINDOWS in FEATURE_GROUPS:\n",
@@ -915,7 +909,7 @@
 " total_features += 4\n",
 " if FeatureGroup.COHORT_COMPARISON in FEATURE_GROUPS:\n",
 " total_features += len(numeric_cols) * 3\n",
-"
+"\n",
 " print(f\"\\n 📝 TOTAL ESTIMATED FEATURES: ~{total_features}\")\n",
 " print()"
 ]
@@ -1000,12 +994,12 @@
 "if multi.primary_entity_dataset:\n",
 " primary_info = multi.datasets[multi.primary_entity_dataset]\n",
 " primary_findings = manager.load_findings(multi.primary_entity_dataset)\n",
-"
+"\n",
 " if primary_findings:\n",
 " # Load the primary dataset from snapshot (not source) to get correct column names\n",
 " primary_df, data_source = load_data_with_snapshot_preference(primary_findings, output_dir=str(FINDINGS_DIR))\n",
 " print(f\" Loaded from: {data_source}\")\n",
-"
+"\n",
 " # Get numeric features for clustering (exclude temporal metadata)\n",
 " from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS\n",
 " numeric_features = [\n",
@@ -1014,12 +1008,12 @@
 " and name != primary_info.target_column\n",
 " and name not in TEMPORAL_METADATA_COLS\n",
 " ]\n",
-"
+"\n",
 " print(f\"\\n📊 Dataset: {multi.primary_entity_dataset}\")\n",
 " print(f\" Total Samples: {len(primary_df):,}\")\n",
 " print(f\" Numeric Features: {len(numeric_features)}\")\n",
 " print(f\" Target Column: {primary_info.target_column}\")\n",
-"
+"\n",
 " # Run full segmentation analysis using framework\n",
 " analysis = segment_analyzer.run_full_analysis(\n",
 " primary_df,\n",
@@ -1029,14 +1023,14 @@
 " dim_reduction=DimensionReductionMethod.PCA,\n",
 " )\n",
 " m = analysis.metrics # Shorthand for metrics\n",
-"
+"\n",
 " # ============================================================\n",
 " # KEY DECISION METRICS\n",
 " # ============================================================\n",
 " print(\"\\n\" + \"=\" * 70)\n",
 " print(\"📊 CLUSTERING DECISION METRICS\")\n",
 " print(\"=\" * 70)\n",
-"
+"\n",
 " print(f\"\"\"\n",
 "┌─────────────────────────────────────────────────────────────────────┐\n",
 "│ METRIC │ VALUE │ INTERPRETATION │\n",
@@ -1046,19 +1040,19 @@
 "│ Optimal Segments Found │ {m.n_segments} │ {m.segments_interpretation:<18} │\n",
 "│ Overall Confidence │ {m.confidence:.0%} │ {m.confidence_interpretation:<18} │\n",
 "└──────────────────────────────────┴─────────────┴────────────────────┘\"\"\")\n",
-"
+"\n",
 " print(f\"\\n🎯 RECOMMENDATION: {m.recommendation.upper().replace('_', ' ')}\")\n",
-" print(
+" print(\"\\n📋 Supporting Evidence:\")\n",
 " for r in m.rationale:\n",
 " print(f\" • {r}\")\n",
-"
+"\n",
 " # ============================================================\n",
 " # SEGMENT PROFILES\n",
 " # ============================================================\n",
 " print(\"\\n\" + \"=\" * 70)\n",
 " print(\"📊 SEGMENT PROFILES\")\n",
 " print(\"=\" * 70 + \"\\n\")\n",
-"
+"\n",
 " segment_data = [{\n",
 " \"Segment\": f\"Segment {p.segment_id}\",\n",
 " \"N (count)\": f\"{p.size:,}\",\n",
@@ -1067,34 +1061,34 @@
 " \"Viable for ML\": \"✓\" if p.size >= 100 else \"⚠️\"\n",
 " } for p in analysis.profiles]\n",
 " display(pd.DataFrame(segment_data))\n",
-"
+"\n",
 " sd = analysis.size_distribution\n",
-" print(
+" print(\"\\n📊 Size Distribution:\")\n",
 " print(f\" Total datapoints: {sd['total']:,}\")\n",
 " print(f\" Smallest segment: {sd['min_size']:,} ({sd['min_pct']:.1f}%)\")\n",
 " print(f\" Largest segment: {sd['max_size']:,} ({sd['max_pct']:.1f}%)\")\n",
 " print(f\" Balance ratio: {sd['balance_ratio']:.2f} (1.0 = perfectly balanced)\")\n",
-"
+"\n",
 " # ============================================================\n",
 " # CLUSTER VISUALIZATION\n",
 " # ============================================================\n",
 " if analysis.has_visualization:\n",
 " viz = analysis.visualization\n",
 " seg_result = analysis.segmentation_result\n",
-"
+"\n",
 " fig = make_subplots(\n",
 " rows=1, cols=3,\n",
 " subplot_titles=(\n",
-" f\"Cluster Visualization (PCA, {viz.explained_variance_ratio:.0%} var)\"
+" f\"Cluster Visualization (PCA, {viz.explained_variance_ratio:.0%} var)\"\n",
 " if viz.explained_variance_ratio else \"Cluster Visualization (PCA)\",\n",
 " \"Segment Sizes\", \"Target Rate\"\n",
 " ),\n",
 " horizontal_spacing=0.12,\n",
 " column_widths=[0.4, 0.3, 0.3]\n",
 " )\n",
-"
+"\n",
 " unique_labels = sorted(set(seg_result.labels[seg_result.labels >= 0]))\n",
-"
+"\n",
 " # Scatter plot - consistent colors by segment ID\n",
 " for label in unique_labels:\n",
 " mask = seg_result.labels == label\n",
@@ -1106,12 +1100,12 @@
 " marker=dict(color=color, size=6, opacity=0.6),\n",
 " name=name, hovertemplate=f\"{name}<br>PC1: %{{x:.2f}}<br>PC2: %{{y:.2f}}<extra></extra>\"\n",
 " ), row=1, col=1)\n",
-"
+"\n",
 " # Short labels for bar charts (avoid overlap)\n",
 " bar_labels = [f\"Seg {p.segment_id}\" for p in analysis.profiles]\n",
 " sizes = [p.size for p in analysis.profiles]\n",
 " bar_colors = [SEGMENT_COLORS.get(p.segment_id, '#888888') for p in analysis.profiles]\n",
-"
+"\n",
 " # Size bars - numbers inside\n",
 " fig.add_trace(go.Bar(\n",
 " y=bar_labels, x=sizes, orientation='h',\n",
@@ -1121,7 +1115,7 @@
 " showlegend=False,\n",
 " hovertemplate=\"Segment %{y}<br>Count: %{x:,}<extra></extra>\"\n",
 " ), row=1, col=2)\n",
-"
+"\n",
 " # Target rates - consistent segment colors, numbers inside\n",
 " if all(p.target_rate is not None for p in analysis.profiles):\n",
 " rates = [p.target_rate * 100 for p in analysis.profiles]\n",
@@ -1136,16 +1130,16 @@
 " overall = sum(p.target_rate * p.size for p in analysis.profiles) / sd['total'] * 100\n",
 " fig.add_vline(x=overall, line_dash=\"dash\", line_color=\"#2c3e50\",\n",
 " annotation_text=f\"Avg: {overall:.1f}%\", annotation_position=\"top\", row=1, col=3)\n",
-"
+"\n",
 " fig.update_layout(\n",
-" title=\"Segment Analysis Overview\"
-" height=400
+" title=\"Segment Analysis Overview\",\n",
+" height=400,\n",
 " template=\"plotly_white\",\n",
 " legend=dict(\n",
-" orientation=\"h\"
-" yanchor=\"top\"
+" orientation=\"h\",\n",
+" yanchor=\"top\",\n",
 " y=-0.15,\n",
-" xanchor=\"center\"
+" xanchor=\"center\",\n",
 " x=0.5\n",
 " ),\n",
 " margin=dict(r=20, b=80)\n",
@@ -1153,11 +1147,11 @@
 " fig.update_xaxes(title_text=\"PC1\", row=1, col=1)\n",
 " fig.update_yaxes(title_text=\"PC2\", row=1, col=1)\n",
 " display_figure(fig)\n",
-"
-" print(
+"\n",
+" print(\"\\n📈 CLUSTER VISUALIZATION:\")\n",
 " print(f\" Method: PCA | Variance Explained: {viz.explained_variance_ratio:.1%}\" if viz.explained_variance_ratio else \" Method: PCA\")\n",
-" print(
-"
+" print(\" Colors: Seg 0=Blue, Seg 1=Red, Seg 2=Green, Seg 3=Purple\")\n",
+"\n",
 " # ============================================================\n",
 " # EPV CAPACITY ANALYSIS\n",
 " # ============================================================\n",
@@ -1165,7 +1159,7 @@
 " print(\"\\n\" + \"=\" * 70)\n",
 " print(\"💡 SEGMENT CAPACITY ANALYSIS (EPV Check)\")\n",
 " print(\"=\" * 70)\n",
-"
+"\n",
 " primary_df['_segment'] = analysis.segmentation_result.labels\n",
 " capacity = capacity_analyzer.analyze_segment_capacity(\n",
 " primary_df[primary_df['_segment'] >= 0],\n",
@@ -1174,14 +1168,14 @@
 " segment_col='_segment'\n",
 " )\n",
 " primary_df.drop('_segment', axis=1, inplace=True)\n",
-"
+"\n",
 " print(f\"\\n🎯 Strategy: {capacity.recommended_strategy.upper()}\")\n",
 " print(f\" Reason: {capacity.strategy_reason}\")\n",
 " if capacity.viable_segments:\n",
 " print(f\"\\n ✅ Viable segments: {capacity.viable_segments}\")\n",
 " if capacity.insufficient_segments:\n",
 " print(f\" ⚠️ Insufficient segments: {capacity.insufficient_segments}\")\n",
-"
+"\n",
 " # Store in findings\n",
 " multi.notes.update({\n",
 " 'segmentation_recommendation': m.recommendation,\n",
@@ -1191,11 +1185,11 @@
 " 'segment_strategy': capacity.recommended_strategy,\n",
 " 'segment_sizes': {f\"segment_{p.segment_id}\": p.size for p in analysis.profiles}\n",
 " })\n",
-"
+"\n",
 " # Initialize bronze layer if not already done\n",
 " if registry.bronze is None:\n",
 " registry.init_bronze(primary_info.source_path)\n",
-"
+"\n",
 " # Persist segmentation strategy to registry\n",
 " registry.add_bronze_segmentation_strategy(\n",
 " strategy=m.recommendation,\n",
@@ -1206,7 +1200,7 @@
 " source_notebook=\"05_multi_dataset\"\n",
 " )\n",
 " print(f\"\\n✅ Persisted segmentation strategy to registry: {m.recommendation}\")\n",
-"
+"\n",
 " # ============================================================\n",
 " # DECISION SUMMARY\n",
 " # ============================================================\n",
@@ -1263,26 +1257,26 @@
 " print(\"\\n\" + \"=\"*70)\n",
 " print(\"DATASET RELATIONSHIP DIAGRAM\")\n",
 " print(\"=\"*70 + \"\\n\")\n",
-"
+"\n",
 " # ASCII diagram\n",
 " if multi.primary_entity_dataset:\n",
 " primary = multi.primary_entity_dataset\n",
 " primary_info = multi.datasets[primary]\n",
-"
+"\n",
 " print(f\" +{'='*30}+\")\n",
 " print(f\" | {primary:^26} | <- PRIMARY (has target)\")\n",
 " print(f\" | {primary_info.row_count:,} rows{' '*15} |\")\n",
 " if primary_info.target_column:\n",
 " print(f\" | Target: {primary_info.target_column:<17} |\")\n",
 " print(f\" +{'='*30}+\")\n",
-"
+"\n",
 " for event_name in multi.event_datasets:\n",
 " event_info = multi.datasets[event_name]\n",
 " join_col = event_info.entity_column or \"?\"\n",
-"
-" print(
+"\n",
+" print(\" |\")\n",
 " print(f\" | {join_col}\")\n",
-" print(
+" print(\" v\")\n",
 " print(f\" +{'-'*30}+\")\n",
 " print(f\" | {event_name:^26} | <- EVENT LEVEL\")\n",
 " print(f\" | {event_info.row_count:,} rows{' '*15} |\")\n",
@@ -1340,7 +1334,7 @@
 "registry.save(RECOMMENDATIONS_PATH)\n",
 "\n",
 "print(f\"\\n✅ Multi-dataset findings saved to: {MULTI_FINDINGS_PATH}\")\n",
-"print(
+"print(\"\\n Contents:\")\n",
 "print(f\" - {len(multi.datasets)} datasets\")\n",
 "print(f\" - {len(multi.relationships)} relationships\")\n",
 "print(f\" - {len(multi.event_datasets)} event datasets to aggregate\")\n",