churnkit 0.76.0a3__py3-none-any.whl → 0.76.1a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +10 -5
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +11 -9
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +52 -46
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +68 -65
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +12 -27
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +216 -221
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +88 -81
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +111 -108
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +44 -38
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +89 -85
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +81 -80
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +83 -89
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +102 -98
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +32 -31
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +33 -29
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +6 -5
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +67 -63
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +38 -23
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +3 -1
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/METADATA +1 -1
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/RECORD +31 -31
- customer_retention/__init__.py +1 -1
- customer_retention/analysis/auto_explorer/explorer.py +2 -2
- customer_retention/analysis/notebook_progress.py +14 -2
- customer_retention/core/compat/__init__.py +10 -0
- customer_retention/core/config/experiments.py +45 -0
- customer_retention/integrations/databricks_init.py +41 -1
- customer_retention/stages/profiling/column_profiler.py +9 -2
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/WHEEL +0 -0
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/entry_points.txt +0 -0
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/licenses/LICENSE +0 -0
@@ -84,17 +84,20 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"03_quality_assessment.ipynb\")\n",
 "\n",
-"from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationRegistry\n",
-"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
-"from customer_retention.core.config.column_config import ColumnType\n",
 "import pandas as pd\n",
-"import numpy as np\n",
-"import plotly.graph_objects as go\n",
 "import plotly.express as px\n",
+"import plotly.graph_objects as go\n",
 "from plotly.subplots import make_subplots\n",
-"
+"\n",
+"from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationRegistry\n",
+"from customer_retention.analysis.visualization import ChartBuilder, display_figure\n",
+"from customer_retention.core.config.column_config import ColumnType\n",
+"from customer_retention.core.config.experiments import (\n",
+" FINDINGS_DIR,\n",
+")"
 ]
 },
 {
@@ -125,6 +128,7 @@
 "\n",
 "# Option 2: Auto-discover the most recent findings file\n",
 "from pathlib import Path\n",
+"\n",
 "import yaml\n",
 "\n",
 "# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
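
Aside: the cell above leans on FINDINGS_DIR (newly imported from customer_retention.core.config.experiments) to auto-discover the latest findings file. A minimal sketch of that pattern, assuming findings are stored as YAML files under that directory (the exact file-naming scheme is not shown in this diff):

    from pathlib import Path

    import yaml

    # Hypothetical findings directory; in the notebook this comes from
    # customer_retention.core.config.experiments.FINDINGS_DIR.
    FINDINGS_DIR = Path("exploration_findings")

    # Treat the most recently modified YAML file as the latest findings.
    candidates = sorted(FINDINGS_DIR.glob("*.yaml"), key=lambda p: p.stat().st_mtime)
    if candidates:
        latest = candidates[-1]
        with latest.open() as fh:
            findings_doc = yaml.safe_load(fh)
        print(f"Auto-discovered findings file: {latest}")
    else:
        print(f"No findings files found under {FINDINGS_DIR}")
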
@@ -156,7 +160,7 @@
 "findings = ExplorationFindings.load(FINDINGS_PATH)\n",
 "\n",
 "# Load data - handle aggregated vs standard paths\n",
-"from customer_retention.stages.temporal import
+"from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n",
 "\n",
 "# For aggregated data, load directly from the parquet source\n",
 "if \"_aggregated\" in FINDINGS_PATH and findings.source_path.endswith('.parquet'):\n",
@@ -193,7 +197,7 @@
 " entity_col = next((name for name, col in findings.columns.items() if col.inferred_type == ColumnType.IDENTIFIER), None)\n",
 " if entity_col:\n",
 " registry.init_silver(entity_col)\n",
-" print(
+" print(\"Initialized new recommendation registry\")\n",
 "\n",
 "print(f\"\\nLoaded findings for {findings.column_count} columns\")"
 ]
@@ -252,7 +256,7 @@
 "validator = DataValidator()\n",
 "\n",
 "# Auto-detect potential key columns\n",
-"potential_keys = [name for name, col in findings.columns.items()
+"potential_keys = [name for name, col in findings.columns.items()\n",
 " if col.inferred_type.value in ('identifier', 'id') or 'id' in name.lower()]\n",
 "KEY_COLUMN = potential_keys[0] if potential_keys else None\n",
 "\n",
@@ -266,7 +270,7 @@
 " print(f\"Total Rows: {dup_result.total_rows:,}\")\n",
 " print(f\"Unique Keys: {dup_result.unique_keys:,}\")\n",
 " print(f\"Duplicate Keys: {dup_result.duplicate_keys:,} ({dup_result.duplicate_percentage:.2f}%)\")\n",
-"
+"\n",
 " # Exact duplicates\n",
 " if dup_result.exact_duplicate_rows > 0:\n",
 " print(f\"\\n⚠️ Exact duplicate rows: {dup_result.exact_duplicate_rows:,}\")\n",
@@ -275,7 +279,7 @@
 " if len(dup_examples) > 0:\n",
 " print(\"\\nExample duplicate rows:\")\n",
 " display(dup_examples)\n",
-"
+"\n",
 " # Add deduplication recommendation for exact duplicates\n",
 " registry.add_bronze_deduplication(\n",
 " key_column=KEY_COLUMN, strategy=\"drop_exact_duplicates\",\n",
@@ -284,13 +288,13 @@
 " )\n",
 " else:\n",
 " print(\"\\n✓ No exact duplicate rows\")\n",
-"
+"\n",
 " # Value conflicts\n",
 " if dup_result.has_value_conflicts:\n",
 " print(f\"\\n⚠️ Value conflicts detected in: {', '.join(dup_result.conflict_columns[:5])}\")\n",
 " if findings.target_column and findings.target_column in dup_result.conflict_columns:\n",
 " print(f\" 🔴 CRITICAL: Target '{findings.target_column}' has conflicting values!\")\n",
-"
+"\n",
 " # Show examples of conflicting records\n",
 " key_counts = df[KEY_COLUMN].value_counts()\n",
 " dup_keys = key_counts[key_counts > 1].head(3).index.tolist()\n",
@@ -298,7 +302,7 @@
 " print(\"\\nExample records with duplicate keys:\")\n",
 " conflict_examples = df[df[KEY_COLUMN].isin(dup_keys)].sort_values(KEY_COLUMN).head(10)\n",
 " display(conflict_examples)\n",
-"
+"\n",
 " # Add deduplication recommendation for value conflicts\n",
 " registry.add_bronze_deduplication(\n",
 " key_column=KEY_COLUMN, strategy=\"keep_first\",\n",
@@ -308,7 +312,7 @@
 " )\n",
 " else:\n",
 " print(\"\\n✓ No value conflicts\")\n",
-"
+"\n",
 " # Duplicate frequency distribution\n",
 " if dup_result.duplicate_keys > 0:\n",
 " key_counts = df[KEY_COLUMN].value_counts()\n",
@@ -317,7 +321,7 @@
 " print(\"\\nDuplicate frequency distribution:\")\n",
 " for count, num_keys in dup_distribution.head(5).items():\n",
 " print(f\" Keys appearing {count}x: {num_keys:,}\")\n",
-"
+"\n",
 " # Recommendations\n",
 " print(\"\\n💡 RECOMMENDATIONS:\")\n",
 " if dup_result.exact_duplicate_rows > 0:\n",
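
The duplicate-key metrics printed in the hunks above come from the notebook's DataValidator result object, whose API is not part of this diff; a rough plain-pandas equivalent of the same counts (data, column names, and the percentage definition are illustrative) might look like:

    import pandas as pd

    # Illustrative frame; KEY_COLUMN stands in for the auto-detected identifier column.
    df = pd.DataFrame({"customer_id": [1, 1, 2, 3, 3, 3],
                       "plan": ["a", "a", "b", "c", "d", "d"]})
    KEY_COLUMN = "customer_id"

    total_rows = len(df)
    unique_keys = df[KEY_COLUMN].nunique()
    key_counts = df[KEY_COLUMN].value_counts()
    duplicate_keys = int((key_counts > 1).sum())
    exact_duplicate_rows = int(df.duplicated().sum())

    print(f"Total Rows: {total_rows:,}")
    print(f"Unique Keys: {unique_keys:,}")
    # Percentage shown relative to unique keys; the validator's exact definition may differ.
    print(f"Duplicate Keys: {duplicate_keys:,} ({duplicate_keys / unique_keys * 100:.2f}%)")
    print(f"Exact duplicate rows: {exact_duplicate_rows:,}")

    # Duplicate frequency distribution, as in the notebook cell.
    dup_distribution = key_counts[key_counts > 1].value_counts().sort_index()
    for count, num_keys in dup_distribution.items():
        print(f"  Keys appearing {count}x: {num_keys:,}")
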
@@ -435,7 +439,7 @@
 "if findings.target_column and findings.target_column in df.columns:\n",
 " target_series = df[findings.target_column]\n",
 " target_counts = target_series.value_counts().sort_index()\n",
-"
+"\n",
 " # Create distribution table\n",
 " dist_data = []\n",
 " for val, count in target_counts.items():\n",
@@ -445,10 +449,10 @@
 " \"count\": count,\n",
 " \"percentage\": f\"{pct:.3f}\"\n",
 " })\n",
-"
+"\n",
 " dist_df = pd.DataFrame(dist_data)\n",
 " display(dist_df)\n",
-"
+"\n",
 " # Calculate imbalance metrics\n",
 " if len(target_counts) == 2:\n",
 " majority = target_counts.max()\n",
@@ -456,10 +460,10 @@
 " minority_class = target_counts.idxmin()\n",
 " imbalance_ratio = majority / minority\n",
 " retention_rate = target_counts.get(1, 0) / len(df) * 100\n",
-"
+"\n",
 " print(f\"\\nImbalance ratio: {imbalance_ratio:.2f}:1 (minority class: {minority_class})\")\n",
 " print(f\"Retention rate: {retention_rate:.1f}%\")\n",
-"
+"\n",
 " # Business context\n",
 " if retention_rate > 70:\n",
 " print(f\"\\n📊 Business Context: {retention_rate:.0f}% retention is healthy!\")\n",
@@ -470,14 +474,14 @@
 " else:\n",
 " print(f\"\\n⚠️ Business Context: {retention_rate:.0f}% retention is concerning!\")\n",
 " print(\" High churn rate requires urgent attention.\")\n",
-"
+"\n",
 " # Modeling recommendations based on imbalance\n",
 " print(\"\\n⚠️ Class imbalance considerations for modeling:\")\n",
 " print(\" - Use stratified sampling for train/test splits\")\n",
 " print(\" - Consider class weights in model training\")\n",
 " print(\" - Evaluate with Precision-Recall AUC (not just ROC-AUC)\")\n",
 " print(\" - Focus on recall for churned class (catch at-risk customers)\")\n",
-"
+"\n",
 " # Add imbalance strategy recommendation\n",
 " if imbalance_ratio < 3:\n",
 " strategy = \"stratified_sampling\"\n",
@@ -491,7 +495,7 @@
 " strategy = \"smote\"\n",
 " rationale = f\"Severe imbalance ({imbalance_ratio:.2f}:1) - consider SMOTE\"\n",
 " print(\" - Consider SMOTE or undersampling (imbalance is severe)\")\n",
-"
+"\n",
 " registry.add_bronze_imbalance_strategy(\n",
 " target_column=findings.target_column,\n",
 " imbalance_ratio=imbalance_ratio,\n",
@@ -500,23 +504,23 @@
 " rationale=rationale,\n",
 " source_notebook=\"03_quality_assessment\"\n",
 " )\n",
-"
+"\n",
 " # Visualization\n",
 " fig = make_subplots(rows=1, cols=2, specs=[[{\"type\": \"pie\"}, {\"type\": \"bar\"}]],\n",
 " subplot_titles=[\"Class Distribution\", \"Count Comparison\"])\n",
-"
+"\n",
 " labels = [f\"{'Retained' if v == 1 else 'Churned'} ({v})\" for v in target_counts.index]\n",
 " fig.add_trace(go.Pie(labels=labels, values=target_counts.values, hole=0.4,\n",
 " marker_colors=[\"#2ecc71\", \"#e74c3c\"]), row=1, col=1)\n",
 " fig.add_trace(go.Bar(x=labels, y=target_counts.values,\n",
 " marker_color=[\"#e74c3c\", \"#2ecc71\"]), row=1, col=2)\n",
-"
+"\n",
 " fig.update_layout(height=350, title_text=\"Target Variable Distribution\",\n",
 " showlegend=False, template=\"plotly_white\")\n",
 " display_figure(fig)\n",
 " else:\n",
 " print(f\"\\nMulticlass target with {len(target_counts)} classes\")\n",
-"
+"\n",
 " fig = go.Figure(go.Bar(x=[str(v) for v in target_counts.index], y=target_counts.values,\n",
 " marker_color=px.colors.qualitative.Set2[:len(target_counts)]))\n",
 " fig.update_layout(height=350, title_text=\"Target Variable Distribution\",\n",
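
For reference, the imbalance metrics and strategy selection in the hunks above can be reproduced standalone. This sketch uses a made-up 0/1 target; the middle (class-weights) branch and its threshold are assumptions, since only the "< 3" and SMOTE branches appear in this diff:

    import pandas as pd

    # Made-up binary target: 1 = retained, 0 = churned.
    target = pd.Series([1] * 800 + [0] * 200)
    target_counts = target.value_counts().sort_index()

    majority = target_counts.max()
    minority = target_counts.min()
    minority_class = target_counts.idxmin()
    imbalance_ratio = majority / minority
    retention_rate = target_counts.get(1, 0) / len(target) * 100

    print(f"Imbalance ratio: {imbalance_ratio:.2f}:1 (minority class: {minority_class})")
    print(f"Retention rate: {retention_rate:.1f}%")

    if imbalance_ratio < 3:
        strategy = "stratified_sampling"
    elif imbalance_ratio < 10:      # assumed middle branch, not shown in this hunk
        strategy = "class_weights"
    else:
        strategy = "smote"
    print(f"Suggested strategy: {strategy}")
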
@@ -593,7 +597,7 @@
 " missing_df = pd.DataFrame(missing_data).sort_values(\"Missing Count\", ascending=False)\n",
 " print(\"Columns with Missing Values:\")\n",
 " display(missing_df)\n",
-"
+"\n",
 " fig = charts.bar_chart(\n",
 " missing_df[\"Column\"].tolist(),\n",
 " [float(x.replace(\"%\", \"\")) for x in missing_df[\"Missing %\"].tolist()],\n",
@@ -742,7 +746,7 @@
 "\n",
 "if numeric_cols:\n",
 " analyzer = SegmentAwareOutlierAnalyzer(max_segments=5)\n",
-"
+"\n",
 " # Run segment-aware analysis\n",
 " segment_result = analyzer.analyze(\n",
 " df,\n",
@@ -750,31 +754,31 @@
 " segment_col=SEGMENT_COL,\n",
 " target_col=findings.target_column\n",
 " )\n",
-"
-" print(
+"\n",
+" print(\"\\n📊 SEGMENTATION RESULTS:\")\n",
 " print(f\" Segments detected: {segment_result.n_segments}\")\n",
-"
+"\n",
 " if segment_result.n_segments > 1:\n",
-" print(
+" print(\"\\n📈 GLOBAL VS SEGMENT OUTLIER COMPARISON:\")\n",
 " print(\"-\" * 60)\n",
-"
+"\n",
 " comparison_data = []\n",
 " for col in numeric_cols:\n",
 " global_outliers = segment_result.global_analysis[col].outliers_detected\n",
 " segment_outliers = sum(\n",
-" seg[col].outliers_detected
+" seg[col].outliers_detected\n",
 " for seg in segment_result.segment_analysis.values()\n",
 " if col in seg\n",
 " )\n",
 " false_outliers = segment_result.false_outliers.get(col, 0)\n",
-"
+"\n",
 " if global_outliers > 0:\n",
 " reduction_pct = (global_outliers - segment_outliers) / global_outliers * 100\n",
 " false_pct = false_outliers / global_outliers * 100\n",
 " else:\n",
 " reduction_pct = 0\n",
 " false_pct = 0\n",
-"
+"\n",
 " comparison_data.append({\n",
 " \"Feature\": col,\n",
 " \"Global Outliers\": global_outliers,\n",
@@ -782,13 +786,13 @@
 " \"False Outliers\": false_outliers,\n",
 " \"Reduction\": f\"{reduction_pct:.1f}%\"\n",
 " })\n",
-"
+"\n",
 " comparison_df = pd.DataFrame(comparison_data)\n",
 " display(comparison_df)\n",
-"
+"\n",
 " # Show false outlier analysis\n",
 " has_false_outliers = any(segment_result.false_outliers.get(col, 0) > 0 for col in numeric_cols)\n",
-"
+"\n",
 " if has_false_outliers:\n",
 " print(\"\\n⚠️ FALSE OUTLIERS DETECTED:\")\n",
 " print(\" (Global outliers that are normal within their segment)\")\n",
@@ -797,14 +801,14 @@
 " global_count = segment_result.global_analysis[col].outliers_detected\n",
 " pct = count / global_count * 100 if global_count > 0 else 0\n",
 " print(f\" • {col}: {count} false outliers ({pct:.1f}% of global)\")\n",
-"
+"\n",
 " # Recommendations\n",
 " print(\"\\n💡 RECOMMENDATIONS:\")\n",
 " if segment_result.segmentation_recommended:\n",
 " print(\" ✅ SEGMENT-SPECIFIC OUTLIER TREATMENT RECOMMENDED\")\n",
 " for rec in segment_result.recommendations:\n",
 " print(f\" • {rec}\")\n",
-"
+"\n",
 " # Add outlier recommendations for columns with high false outlier rate\n",
 " for col, count in segment_result.false_outliers.items():\n",
 " if count > 0:\n",
@@ -819,27 +823,27 @@
 " )\n",
 " else:\n",
 " print(\" ℹ️ Global outlier treatment is appropriate for this data\")\n",
-"
+"\n",
 " # Rationale\n",
 " print(\"\\n📋 RATIONALE:\")\n",
 " for rationale in segment_result.rationale:\n",
 " print(f\" • {rationale}\")\n",
-"
+"\n",
 " # Visualization: Compare outlier counts\n",
 " cols_with_diff = [\n",
 " row[\"Feature\"] for _, row in comparison_df.iterrows()\n",
 " if row[\"Global Outliers\"] > 0 and row[\"Global Outliers\"] != row[\"Segment Outliers\"]\n",
 " ]\n",
-"
+"\n",
 " if cols_with_diff and len(cols_with_diff) <= 8:\n",
 " fig = go.Figure()\n",
-"
+"\n",
 " global_counts = [comparison_df[comparison_df[\"Feature\"] == c][\"Global Outliers\"].values[0] for c in cols_with_diff]\n",
 " segment_counts = [comparison_df[comparison_df[\"Feature\"] == c][\"Segment Outliers\"].values[0] for c in cols_with_diff]\n",
-"
+"\n",
 " fig.add_trace(go.Bar(name=\"Global Outliers\", x=cols_with_diff, y=global_counts, marker_color=\"#e74c3c\"))\n",
 " fig.add_trace(go.Bar(name=\"Segment Outliers\", x=cols_with_diff, y=segment_counts, marker_color=\"#2ecc71\"))\n",
-"
+"\n",
 " fig.update_layout(\n",
 " barmode=\"group\",\n",
 " title=\"Global vs Segment-Specific Outlier Detection\",\n",
@@ -852,7 +856,7 @@
 " else:\n",
 " print(\"\\n ℹ️ Data appears homogeneous (single segment)\")\n",
 " print(\" → Proceeding with standard global outlier detection\")\n",
-"
+"\n",
 " # Store result in findings metadata for use in later notebooks\n",
 " findings.metadata[\"segment_aware_analysis\"] = {\n",
 " \"n_segments\": segment_result.n_segments,\n",
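
SegmentAwareOutlierAnalyzer itself is not shown in this diff, but the global-versus-segment comparison it reports can be approximated with plain pandas. A sketch with invented segments, where values that are normal within a small segment look like outliers against the global distribution:

    import numpy as np
    import pandas as pd

    def iqr_outlier_count(series: pd.Series) -> int:
        """Count values outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR."""
        q1, q3 = series.quantile(0.25), series.quantile(0.75)
        iqr = q3 - q1
        return int(((series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)).sum())

    rng = np.random.default_rng(0)
    # Two segments with very different scales.
    df = pd.DataFrame({
        "segment": ["a"] * 500 + ["b"] * 100,
        "spend": np.concatenate([rng.normal(50, 5, 500), rng.normal(200, 10, 100)]),
    })

    global_outliers = iqr_outlier_count(df["spend"])
    segment_outliers = sum(iqr_outlier_count(g["spend"]) for _, g in df.groupby("segment"))
    reduction_pct = (global_outliers - segment_outliers) / global_outliers * 100 if global_outliers else 0

    print(f"Global outliers: {global_outliers}")
    print(f"Segment outliers: {segment_outliers}")
    print(f"Reduction: {reduction_pct:.1f}%")
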
@@ -934,11 +938,11 @@
 " iqr = q3 - q1\n",
 " lower_bound = q1 - 1.5 * iqr\n",
 " upper_bound = q3 + 1.5 * iqr\n",
-"
+"\n",
 " outliers_low = (series < lower_bound).sum()\n",
 " outliers_high = (series > upper_bound).sum()\n",
 " total_outliers = outliers_low + outliers_high\n",
-"
+"\n",
 " outlier_data.append({\n",
 " \"feature\": col_name,\n",
 " \"Q1\": q1,\n",
@@ -956,7 +960,7 @@
 "\n",
 "# Display IQR bounds table\n",
 "print(\"\\n📊 IQR BOUNDS TABLE:\")\n",
-"bounds_display = outlier_df[[\"feature\", \"Q1\", \"Q3\", \"IQR\", \"lower_bound\", \"upper_bound\"
+"bounds_display = outlier_df[[\"feature\", \"Q1\", \"Q3\", \"IQR\", \"lower_bound\", \"upper_bound\",\n",
 " \"outliers_low\", \"outliers_high\"]].copy()\n",
 "for col in [\"Q1\", \"Q3\", \"IQR\", \"lower_bound\", \"upper_bound\"]:\n",
 " bounds_display[col] = bounds_display[col].apply(lambda x: f\"{x:.2f}\")\n",
@@ -974,11 +978,11 @@
 " print(f\" Below lower: {row['outliers_low']:,}\")\n",
 " if row[\"outliers_high\"] > 0:\n",
 " print(f\" Above upper: {row['outliers_high']:,}\")\n",
-"
+"\n",
 " # Determine action and add recommendation (skip if segment-aware already added)\n",
 " col_name = row['feature']\n",
 " existing_outlier_recs = [r for r in registry.bronze.outlier_handling if r.target_column == col_name]\n",
-"
+"\n",
 " if not existing_outlier_recs and row[\"outlier_pct\"] > 5: # Only add if significant and not already handled\n",
 " if row[\"outlier_pct\"] > 10:\n",
 " action = \"log_transform\"\n",
@@ -988,7 +992,7 @@
 " action = \"winsorize\"\n",
 " rationale = f\"{row['outlier_pct']:.1f}% outliers - winsorize to 1st/99th percentile\"\n",
 " print(\" → Consider Winsorization (clip to 1st/99th percentile)\")\n",
-"
+"\n",
 " registry.add_bronze_outlier(\n",
 " column=col_name, action=action,\n",
 " parameters={\"method\": \"iqr\", \"lower_bound\": row[\"lower_bound\"], \"upper_bound\": row[\"upper_bound\"]},\n",
@@ -1003,16 +1007,16 @@
 "# Box plots for columns with outliers\n",
 "if len(cols_with_outliers) > 0 and len(cols_with_outliers) <= 6:\n",
 " outlier_cols = cols_with_outliers[\"feature\"].tolist()\n",
-"
+"\n",
 " fig = make_subplots(rows=1, cols=len(outlier_cols), subplot_titles=outlier_cols)\n",
-"
+"\n",
 " for i, col in enumerate(outlier_cols, 1):\n",
 " fig.add_trace(\n",
 " go.Box(y=df[col].dropna(), name=col, boxpoints=\"outliers\",\n",
 " marker_color=\"#3498db\", showlegend=False),\n",
 " row=1, col=i\n",
 " )\n",
-"
+"\n",
 " fig.update_layout(height=400, title_text=\"Outlier Distribution (Box Plots)\",\n",
 " template=\"plotly_white\")\n",
 " display_figure(fig)"
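
The IQR bounds and the action thresholds applied above (act only above 5% outliers, prefer a log transform above 10%, otherwise winsorize to the 1st/99th percentile) can be sketched standalone on synthetic data:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(1)
    # ~8% of values drawn from a much higher distribution.
    series = pd.Series(np.append(rng.normal(100, 10, 920), rng.normal(400, 50, 80)))

    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    outliers_low = int((series < lower_bound).sum())
    outliers_high = int((series > upper_bound).sum())
    outlier_pct = (outliers_low + outliers_high) / len(series) * 100
    print(f"Bounds: [{lower_bound:.2f}, {upper_bound:.2f}]  outliers: {outlier_pct:.1f}%")

    # Mirrors the notebook's decision rule.
    if outlier_pct > 10:
        action = "log_transform"
    elif outlier_pct > 5:
        action = "winsorize"
    else:
        action = "none"
    print(f"Recommended action: {action}")

    if action == "winsorize":
        clipped = series.clip(series.quantile(0.01), series.quantile(0.99))
        print(f"Max after winsorizing: {clipped.max():.2f}")
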
@@ -1082,7 +1086,7 @@
 " print(f\"Loaded date sequence from findings: {DATE_SEQUENCE}\")\n",
 "\n",
 "# Detect date columns from findings\n",
-"date_cols = [name for name, col in findings.columns.items()
+"date_cols = [name for name, col in findings.columns.items()\n",
 " if col.inferred_type == ColumnType.DATETIME]\n",
 "\n",
 "print(\"=\" * 60)\n",
@@ -1094,12 +1098,12 @@
 " df_dates = df.copy()\n",
 " for col in date_cols:\n",
 " df_dates[col] = pd.to_datetime(df_dates[col], errors='coerce', format='mixed')\n",
-"
+"\n",
 " # Date ranges\n",
 " print(\"\\n📅 DATE RANGES:\")\n",
 " for col in date_cols:\n",
 " print(f\" {col}: {df_dates[col].min()} to {df_dates[col].max()}\")\n",
-"
+"\n",
 " # Placeholder detection\n",
 " print(\"\\n🕵️ PLACEHOLDER DATE DETECTION:\")\n",
 " for col in date_cols:\n",
@@ -1108,14 +1112,14 @@
 " print(f\" {col}: {old_dates:,} dates before 2005 (possible placeholders)\")\n",
 " else:\n",
 " print(f\" {col}: No suspicious early dates\")\n",
-"
+"\n",
 " # Sequence validation\n",
 " if DATE_SEQUENCE and len(DATE_SEQUENCE) >= 2:\n",
 " valid_sequence_cols = [c for c in DATE_SEQUENCE if c in date_cols]\n",
 " if len(valid_sequence_cols) >= 2:\n",
-" print(
+" print(\"\\n🔗 DATE SEQUENCE VALIDATION:\")\n",
 " print(f\" Expected order: {' ≤ '.join(valid_sequence_cols)}\")\n",
-"
+"\n",
 " total_violations = 0\n",
 " for i in range(len(valid_sequence_cols) - 1):\n",
 " col1, col2 = valid_sequence_cols[i], valid_sequence_cols[i + 1]\n",
@@ -1123,13 +1127,13 @@
 " mask = df_dates[col1].notna() & df_dates[col2].notna()\n",
 " violations = (df_dates.loc[mask, col2] < df_dates.loc[mask, col1]).sum()\n",
 " total_violations += violations\n",
-"
+"\n",
 " if violations > 0:\n",
 " pct = violations / mask.sum() * 100\n",
 " print(f\" ⚠️ {col2} < {col1}: {violations:,} violations ({pct:.2f}%)\")\n",
 " else:\n",
 " print(f\" ✓ {col1} ≤ {col2}: No violations\")\n",
-"
+"\n",
 " if total_violations == 0:\n",
 " print(\"\\n ✅ All date sequences valid\")\n",
 " else:\n",
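
The pairwise sequence check above works for any ordered list of date columns; a compact standalone version with invented column names and data:

    import pandas as pd

    # Invented example: signup should precede first_order, which should precede churn.
    df_dates = pd.DataFrame({
        "signup_date": pd.to_datetime(["2021-01-01", "2021-02-01", "2021-03-01"]),
        "first_order_date": pd.to_datetime(["2021-01-05", "2021-01-15", None]),
        "churn_date": pd.to_datetime(["2021-06-01", "2021-01-10", "2021-09-01"]),
    })
    DATE_SEQUENCE = ["signup_date", "first_order_date", "churn_date"]

    total_violations = 0
    for col1, col2 in zip(DATE_SEQUENCE, DATE_SEQUENCE[1:]):
        mask = df_dates[col1].notna() & df_dates[col2].notna()
        violations = int((df_dates.loc[mask, col2] < df_dates.loc[mask, col1]).sum())
        total_violations += violations
        print(f"{col1} <= {col2}: {violations} violation(s)")

    print("All date sequences valid" if total_violations == 0
          else f"{total_violations} total violation(s)")
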
@@ -1186,7 +1190,7 @@
 },
 "outputs": [],
 "source": [
-"binary_cols = [name for name, col in findings.columns.items()
+"binary_cols = [name for name, col in findings.columns.items()\n",
 " if col.inferred_type == ColumnType.BINARY\n",
 " and name not in TEMPORAL_METADATA_COLS]\n",
 "\n",
@@ -1203,7 +1207,7 @@
 " count_0 = (df[col] == 0).sum()\n",
 " count_1 = (df[col] == 1).sum()\n",
 " total = count_0 + count_1\n",
-"
+"\n",
 " binary_results.append({\n",
 " 'column': col,\n",
 " 'unique_values': unique_vals,\n",
@@ -1212,13 +1216,13 @@
 " 'count_1': count_1,\n",
 " 'pct_1': count_1 / total * 100 if total > 0 else 0\n",
 " })\n",
-"
+"\n",
 " status = \"✓\" if is_valid else \"⚠️\"\n",
 " print(f\"\\n{status} {col}:\")\n",
 " print(f\" Unique values: {unique_vals}\")\n",
 " print(f\" 0 (No): {count_0:,} ({count_0/total*100:.1f}%)\")\n",
 " print(f\" 1 (Yes): {count_1:,} ({count_1/total*100:.1f}%)\")\n",
-"
+"\n",
 " if not is_valid:\n",
 " invalid_vals = [v for v in unique_vals if v not in [0, 1, 0.0, 1.0]]\n",
 " print(f\" ⚠️ Invalid values found: {invalid_vals}\")\n",
@@ -1226,7 +1230,7 @@
 " if len(binary_cols) <= 6:\n",
 " n_cols = len(binary_cols)\n",
 " fig = make_subplots(rows=1, cols=n_cols, subplot_titles=binary_cols)\n",
-"
+"\n",
 " for i, col in enumerate(binary_cols, 1):\n",
 " counts = df[col].value_counts().sort_index()\n",
 " fig.add_trace(\n",
@@ -1234,7 +1238,7 @@
 " marker_color=['#d62728', '#2ca02c'], showlegend=False),\n",
 " row=1, col=i\n",
 " )\n",
-"
+"\n",
 " fig.update_layout(height=350, title_text=\"Binary Field Distributions\",\n",
 " template='plotly_white')\n",
 " display_figure(fig)\n",
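
The binary-field check above boils down to asserting that the observed values are a subset of {0, 1}; a self-contained version on invented data:

    import pandas as pd

    df = pd.DataFrame({"has_discount": [0, 1, 1, 0, 1], "auto_renew": [0, 1, 2, 1, 0]})

    for col in ["has_discount", "auto_renew"]:
        unique_vals = df[col].dropna().unique().tolist()
        is_valid = set(unique_vals) <= {0, 1, 0.0, 1.0}
        count_0 = int((df[col] == 0).sum())
        count_1 = int((df[col] == 1).sum())
        total = count_0 + count_1
        status = "OK" if is_valid else "WARN"
        print(f"[{status}] {col}: 0 -> {count_0} ({count_0 / total * 100:.1f}%), "
              f"1 -> {count_1} ({count_1 / total * 100:.1f}%)")
        if not is_valid:
            invalid_vals = [v for v in unique_vals if v not in (0, 1, 0.0, 1.0)]
            print(f"  Invalid values found: {invalid_vals}")
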
@@ -1295,7 +1299,7 @@
 " if lower_val not in case_variants:\n",
 " case_variants[lower_val] = []\n",
 " case_variants[lower_val].append(val)\n",
-"
+"\n",
 " for lower_val, variants in case_variants.items():\n",
 " if len(variants) > 1:\n",
 " consistency_issues.append({\n",
@@ -1308,7 +1312,7 @@
 "if consistency_issues:\n",
 " print(\"Data Consistency Issues:\")\n",
 " display(pd.DataFrame([{k: v for k, v in issue.items() if k != \"variants\"} for issue in consistency_issues]))\n",
-"
+"\n",
 " # Add consistency recommendations\n",
 " for issue in consistency_issues:\n",
 " registry.add_bronze_consistency(\n",
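
The case-consistency scan above groups each categorical value by its lowercased form and flags values that appear under more than one spelling; roughly, on invented data:

    import pandas as pd

    df = pd.DataFrame({"plan_type": ["Basic", "basic", "BASIC", "Premium", "premium", "Trial"]})

    consistency_issues = []
    for col in ["plan_type"]:
        case_variants = {}
        for val in df[col].dropna().unique():
            case_variants.setdefault(str(val).lower(), []).append(val)
        for lower_val, variants in case_variants.items():
            if len(variants) > 1:
                consistency_issues.append({"column": col, "value": lower_val,
                                           "n_variants": len(variants)})

    if consistency_issues:
        print("Data Consistency Issues:")
        print(pd.DataFrame(consistency_issues))
    else:
        print("No case-variant issues found")
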
@@ -1378,7 +1382,7 @@
 " high_severity = [r for r in cleaning_recs if r.severity == \"high\"]\n",
 " medium_severity = [r for r in cleaning_recs if r.severity == \"medium\"]\n",
 " low_severity = [r for r in cleaning_recs if r.severity == \"low\"]\n",
-"
+"\n",
 " if high_severity:\n",
 " print(\"\\n🔴 HIGH PRIORITY (must fix before modeling):\")\n",
 " print(\"-\" * 60)\n",
@@ -1391,7 +1395,7 @@
 " print(\" Action Steps:\")\n",
 " for step in rec.action_steps:\n",
 " print(f\" • {step}\")\n",
-"
+"\n",
 " if medium_severity:\n",
 " print(\"\\n🟡 MEDIUM PRIORITY (recommended fixes):\")\n",
 " print(\"-\" * 60)\n",
@@ -1404,7 +1408,7 @@
 " print(\" Action Steps:\")\n",
 " for step in rec.action_steps:\n",
 " print(f\" • {step}\")\n",
-"
+"\n",
 " if low_severity:\n",
 " print(\"\\n🟢 LOW PRIORITY (nice to have):\")\n",
 " print(\"-\" * 60)\n",
@@ -1413,13 +1417,13 @@
 " print(f\" Issue: {rec.description}\")\n",
 " print(f\" Strategy: {rec.strategy_label}\")\n",
 " print(f\" Impact: {rec.problem_impact}\")\n",
-"
+"\n",
 " # Persist cleaning recommendations to registry\n",
 " for rec in cleaning_recs:\n",
 " # Check if not already added by previous sections\n",
 " existing_null = [r for r in registry.bronze.null_handling if r.target_column == rec.column_name]\n",
 " existing_outlier = [r for r in registry.bronze.outlier_handling if r.target_column == rec.column_name]\n",
-"
+"\n",
 " if rec.issue_type in [\"null_values\", \"missing_values\"] and not existing_null:\n",
 " strategy = \"median\" if \"median\" in rec.strategy.lower() else \"mode\" if \"mode\" in rec.strategy.lower() else \"drop\"\n",
 " registry.add_bronze_null(\n",
@@ -1436,12 +1440,12 @@
 " rationale=rec.description,\n",
 " source_notebook=\"03_quality_assessment\"\n",
 " )\n",
-"
+"\n",
 " # Summary table\n",
 " print(\"\\n\" + \"=\" * 80)\n",
 " print(\"CLEANUP SUMMARY\")\n",
 " print(\"=\" * 80)\n",
-"
+"\n",
 " summary_data = []\n",
 " for rec in cleaning_recs:\n",
 " summary_data.append({\n",
@@ -1451,10 +1455,10 @@
 " \"Recommended Action\": rec.strategy_label,\n",
 " \"Affected Rows\": f\"{rec.affected_rows:,}\"\n",
 " })\n",
-"
+"\n",
 " summary_df = pd.DataFrame(summary_data)\n",
 " display(summary_df)\n",
-"
+"\n",
 " # Total impact\n",
 " total_affected = sum(r.affected_rows for r in cleaning_recs)\n",
 " unique_affected = min(total_affected, len(df)) # Can't exceed total rows\n",
@@ -1513,7 +1517,7 @@
 "\n",
 "# Summary of recommendations\n",
 "all_recs = registry.all_recommendations\n",
-"print(
+"print(\"\\n📋 Recommendations Summary:\")\n",
 "print(f\" Bronze layer: {len(registry.get_by_layer('bronze'))} recommendations\")\n",
 "if registry.silver:\n",
 " print(f\" Silver layer: {len(registry.get_by_layer('silver'))} recommendations\")\n",