churnkit 0.76.1a1__py3-none-any.whl → 0.76.1a2__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +10 -5
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +6 -6
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +52 -46
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +68 -65
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +12 -27
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +216 -221
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +88 -81
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +111 -108
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +44 -38
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +89 -85
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +81 -80
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +83 -89
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +102 -98
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +32 -31
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +33 -29
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +6 -5
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +67 -63
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +38 -23
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +3 -1
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/METADATA +1 -1
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/RECORD +30 -30
- customer_retention/__init__.py +1 -1
- customer_retention/analysis/auto_explorer/explorer.py +2 -2
- customer_retention/analysis/notebook_progress.py +4 -1
- customer_retention/core/compat/__init__.py +10 -0
- customer_retention/integrations/databricks_init.py +13 -0
- customer_retention/stages/profiling/column_profiler.py +9 -2
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/WHEEL +0 -0
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/entry_points.txt +0 -0
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/licenses/LICENSE +0 -0
{churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb

@@ -82,18 +82,19 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"07_modeling_readiness.ipynb\")\n",
 "\n",
-"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
-"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
-"from customer_retention.core.config.column_config import ColumnType\n",
-"from customer_retention.stages.modeling import ImbalanceRecommender, ImbalanceHandler, ImbalanceStrategy\n",
 "import pandas as pd\n",
-"import numpy as np\n",
 "import plotly.graph_objects as go\n",
-"
-"from
-"from customer_retention.
+"\n",
+"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
+"from customer_retention.analysis.visualization import ChartBuilder, display_figure\n",
+"from customer_retention.core.config.column_config import ColumnType\n",
+"from customer_retention.core.config.experiments import (\n",
+" FINDINGS_DIR,\n",
+")\n",
+"from customer_retention.stages.modeling import ImbalanceRecommender"
 ]
 },
 {
@@ -150,7 +151,7 @@
 "findings = ExplorationFindings.load(FINDINGS_PATH)\n",
 "\n",
 "# Load data - handle aggregated vs standard paths\n",
-"from customer_retention.stages.temporal import
+"from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n",
 "\n",
 "# For aggregated data, load directly from the parquet source\n",
 "if \"_aggregated\" in FINDINGS_PATH and findings.source_path.endswith('.parquet'):\n",
@@ -217,11 +218,11 @@
 "has_target = findings.target_column is not None\n",
 "checklist.append({\"Check\": \"Target column identified\", \"Status\": \"Pass\" if has_target else \"Fail\"})\n",
 "\n",
-"has_features = len([c for c in findings.columns.values()
+"has_features = len([c for c in findings.columns.values()\n",
 " if c.inferred_type not in [ColumnType.IDENTIFIER, ColumnType.TARGET]]) > 0\n",
 "checklist.append({\"Check\": \"Feature columns available\", \"Status\": \"Pass\" if has_features else \"Fail\"})\n",
 "\n",
-"high_missing = any(c.universal_metrics.get(\"null_percentage\", 0) > 50
+"high_missing = any(c.universal_metrics.get(\"null_percentage\", 0) > 50\n",
 " for c in findings.columns.values())\n",
 "checklist.append({\"Check\": \"No columns with >50% missing\", \"Status\": \"Fail\" if high_missing else \"Pass\"})\n",
 "\n",
@@ -289,31 +290,31 @@
 "if findings.target_column:\n",
 " target = findings.target_column\n",
 " target_series = df[target]\n",
-"
+"\n",
 " print(\"=\" * 70)\n",
 " print(\"CLASS IMBALANCE ANALYSIS\")\n",
 " print(\"=\" * 70)\n",
-"
+"\n",
 " print(f\"\\nTarget Column: {target}\")\n",
 " print(f\"Target Type: {findings.target_type}\")\n",
 " print(f\"Missing Values: {target_series.isnull().sum()}\")\n",
-"
+"\n",
 " if findings.target_type == \"binary\":\n",
 " value_counts = target_series.value_counts()\n",
 " majority_class = value_counts.idxmax()\n",
 " minority_class = value_counts.idxmin()\n",
 " majority_count = value_counts.max()\n",
 " minority_count = value_counts.min()\n",
-"
-" print(
+"\n",
+" print(\"\\n📊 CLASS DISTRIBUTION:\")\n",
 " print(f\" Majority Class ({majority_class}): {majority_count:,} ({majority_count/len(df)*100:.1f}%)\")\n",
 " print(f\" Minority Class ({minority_class}): {minority_count:,} ({minority_count/len(df)*100:.1f}%)\")\n",
-"
+"\n",
 " # Use framework recommender for strategy recommendations\n",
 " recommender = ImbalanceRecommender()\n",
 " rec = recommender.recommend(target_series, n_samples=len(df))\n",
 " rec.print_recommendation()\n",
-"
+"\n",
 " # Visualize\n",
 " severity_colors = {\"low\": \"#2ca02c\", \"moderate\": \"#ffbb00\", \"high\": \"#ff7f0e\", \"severe\": \"#d62728\"}\n",
 " fig = go.Figure(go.Bar(\n",
@@ -330,14 +331,14 @@
 " template='plotly_white', height=400\n",
 " )\n",
 " display_figure(fig)\n",
-"
+"\n",
 " # Show sklearn class weights\n",
-" print(
+" print(\"\\n💡 SKLEARN CLASS WEIGHTS:\")\n",
 " weight_minority = len(df) / (2 * minority_count)\n",
 " weight_majority = len(df) / (2 * majority_count)\n",
 " print(f\" class_weight={{0: {weight_majority:.3f}, 1: {weight_minority:.3f}}}\")\n",
-" print(
-"
+" print(\" Or use class_weight='balanced'\")\n",
+"\n",
 " # Store recommendation for later use\n",
 " imbalance_recommendation = rec\n",
 "else:\n",
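A note on the two completed print( lines above: the weights the notebook prints follow sklearn's 'balanced' heuristic, n_samples / (n_classes * count_per_class). A minimal standalone sketch (toy data, not taken from the package) showing that the notebook's manual formula and sklearn's compute_class_weight agree:

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Toy imbalanced binary target: 90 negatives, 10 positives.
y = np.array([0] * 90 + [1] * 10)

# The notebook's manual formula: len(df) / (2 * class_count).
manual = {cls: len(y) / (2 * (y == cls).sum()) for cls in (0, 1)}
print(manual)  # {0: 0.555..., 1: 5.0}

# class_weight='balanced' computes the same n_samples / (n_classes * count).
balanced = compute_class_weight("balanced", classes=np.array([0, 1]), y=y)
print(dict(zip([0, 1], balanced)))  # matches the manual weights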
@@ -397,11 +398,11 @@
 "\n",
 "if findings.target_column:\n",
 " target = findings.target_column\n",
-"
+"\n",
 " for col_name, col_info in findings.columns.items():\n",
 " if col_name == target or col_info.inferred_type == ColumnType.IDENTIFIER or col_name in TEMPORAL_METADATA_COLS:\n",
 " continue\n",
-"
+"\n",
 " if col_info.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]:\n",
 " corr = df[[col_name, target]].corr().iloc[0, 1]\n",
 " if abs(corr) > 0.9:\n",
@@ -410,7 +411,7 @@
 " \"Risk\": \"High\",\n",
 " \"Reason\": f\"Very high correlation ({corr:.3f}) - potential leakage\"\n",
 " })\n",
-"
+"\n",
 " if any(kw in col_name.lower() for kw in ['future', 'outcome', 'result', 'after']):\n",
 " leakage_risks.append({\n",
 " \"Column\": col_name,\n",
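The two leakage heuristics in this hunk are self-contained: flag numeric columns whose |correlation| with the target exceeds 0.9, and flag columns whose names contain post-outcome keywords. A runnable toy sketch of the same checks (the DataFrame and column names are hypothetical, not from the package):

import pandas as pd

df = pd.DataFrame({
    "tenure_months": [5, 2, 8, 3, 6],
    "refund_after_churn": [0, 1, 1, 0, 1],  # mirrors the target exactly
    "churned": [0, 1, 1, 0, 1],
})
target = "churned"

leakage_risks = []
for col in df.columns.drop(target):
    # Heuristic 1: near-perfect correlation with the target.
    corr = df[[col, target]].corr().iloc[0, 1]
    if abs(corr) > 0.9:
        leakage_risks.append({"Column": col, "Reason": f"corr={corr:.3f}"})
    # Heuristic 2: name suggests the value is measured after the outcome.
    if any(kw in col.lower() for kw in ["future", "outcome", "result", "after"]):
        leakage_risks.append({"Column": col, "Reason": "post-outcome keyword"})

# refund_after_churn is flagged by both heuristics; tenure_months by neither.
print(leakage_risks)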
@@ -473,7 +474,7 @@
 "for col_type, count in sorted(type_summary.items()):\n",
 " print(f\" {col_type}: {count}\")\n",
 "\n",
-"usable_features = sum(1 for c in findings.columns.values()
+"usable_features = sum(1 for c in findings.columns.values()\n",
 " if c.inferred_type not in [ColumnType.IDENTIFIER, ColumnType.TARGET])\n",
 "print(f\"\\nUsable features for modeling: {usable_features}\")"
 ]
@@ -593,10 +594,10 @@
 " print(f\" • {col} ({info.availability_type}, {info.coverage_pct:.0f}% coverage)\")\n",
 " if len(findings.problematic_availability_columns) > 10:\n",
 " print(f\" ... and {len(findings.problematic_availability_columns) - 10} more\")\n",
-"
+"\n",
 " action = findings.metadata.get(\"availability_action\", \"exclude\")\n",
 " print(f\"\\n📋 Action: {action.upper()}\")\n",
-" print(
+" print(\" These features will be excluded in notebook 08.\")\n",
 "else:\n",
 " print(\"\\n✅ All features have full temporal coverage.\")"
 ]
@@ -700,15 +701,15 @@
 "if 'X' in dir() and 'y' in dir():\n",
 " detector = LeakageDetector()\n",
 " result = detector.run_all_checks(X, y, include_pit=False)\n",
-"
+"\n",
 " print(\"=\" * 70)\n",
 " print(\"FINAL LEAKAGE VALIDATION\")\n",
 " print(\"=\" * 70)\n",
-"
+"\n",
 " if result.passed:\n",
 " print(\"\\n✅ PASSED: No critical leakage issues\")\n",
 " print(f\" Checks run: {len(result.checks)}\")\n",
-"
+"\n",
 " # Show warnings for HIGH severity issues\n",
 " high_issues = [c for c in result.checks if c.severity.value == 'high']\n",
 " if high_issues:\n",
{churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb

@@ -83,25 +83,29 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"08_baseline_experiments.ipynb\")\n",
 "\n",
-"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
-"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
-"from customer_retention.core.config.column_config import ColumnType\n",
 "import pandas as pd\n",
-"import
-"\n",
-"from sklearn.model_selection import train_test_split, cross_val_score\n",
-"from sklearn.preprocessing import StandardScaler, LabelEncoder\n",
-"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
+"from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier\n",
 "from sklearn.linear_model import LogisticRegression\n",
-"from sklearn.metrics import (
-"
-"
+"from sklearn.metrics import (\n",
+" average_precision_score,\n",
+" classification_report,\n",
+" f1_score,\n",
+" precision_score,\n",
+" recall_score,\n",
+" roc_auc_score,\n",
+")\n",
+"from sklearn.model_selection import cross_val_score, train_test_split\n",
+"from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
 "\n",
-"
-"from
-"from customer_retention.core.config.
+"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
+"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
+"from customer_retention.core.config.column_config import ColumnType\n",
+"from customer_retention.core.config.experiments import (\n",
+" FINDINGS_DIR,\n",
+")"
 ]
 },
 {
@@ -158,7 +162,7 @@
 "findings = ExplorationFindings.load(FINDINGS_PATH)\n",
 "\n",
 "# Load data - handle aggregated vs standard paths\n",
-"from customer_retention.stages.temporal import
+"from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n",
 "\n",
 "# For aggregated data, load directly from the parquet source\n",
 "if \"_aggregated\" in FINDINGS_PATH and findings.source_path.endswith('.parquet'):\n",
@@ -269,7 +273,7 @@
 " print(f\" {col_type}: {count}\")\n",
 "\n",
 "# Show excluded columns\n",
-"excluded = [name for name, col in findings.columns.items()
+"excluded = [name for name, col in findings.columns.items()\n",
 " if col.inferred_type in [ColumnType.IDENTIFIER, ColumnType.TARGET, ColumnType.TEXT]]\n",
 "if excluded:\n",
 " print(f\"\\n⛔ Excluded Columns: {', '.join(excluded)}\")"
@@ -309,19 +313,19 @@
 " selector = FeatureSelector(target_column=findings.target_column)\n",
 " availability_recs = selector.get_availability_recommendations(findings.feature_availability)\n",
 " unavailable_features = [rec.column for rec in availability_recs]\n",
-"
+"\n",
 " print(f\"\\n⚠️ {len(availability_recs)} feature(s) have availability issues:\\n\")\n",
 " for rec in availability_recs:\n",
 " print(f\" • {rec.column} ({rec.issue_type}, {rec.coverage_pct:.0f}% coverage)\")\n",
-"
+"\n",
 " print(\"\\n📋 Alternative approaches (for investigation):\")\n",
 " print(\" • segment_by_cohort: Train separate models per availability period\")\n",
 " print(\" • add_indicator: Create availability flags and impute missing\")\n",
 " print(\" • filter_window: Restrict data to feature's available period\")\n",
-"
+"\n",
 " original_count = len(feature_cols)\n",
 " feature_cols = [f for f in feature_cols if f not in unavailable_features]\n",
-"
+"\n",
 " print(f\"\\n🗑️ Removed {original_count - len(feature_cols)} unavailable features\")\n",
 " print(f\"📊 Features remaining: {len(feature_cols)}\")\n",
 "else:\n",
@@ -377,7 +381,7 @@
 "\n",
 "print(f\"Train size: {len(X_train):,} ({len(X_train)/len(X)*100:.0f}%)\")\n",
 "print(f\"Test size: {len(X_test):,} ({len(X_test)/len(X)*100:.0f}%)\")\n",
-"print(
+"print(\"\\nTrain class distribution:\")\n",
 "print(f\" Retained (1): {(y_train == 1).sum():,} ({(y_train == 1).sum()/len(y_train)*100:.1f}%)\")\n",
 "print(f\" Churned (0): {(y_train == 0).sum():,} ({(y_train == 0).sum()/len(y_train)*100:.1f}%)\")"
 ]
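The completed print( line reports the train-side class distribution. This hunk doesn't show whether the split is stratified; if it isn't, stratify=y in train_test_split is the standard way to keep the printed train/test percentages in line with the full dataset. A toy sketch:

import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(1000).reshape(-1, 1)
y = np.array([0] * 800 + [1] * 200)  # 20% positive class

# stratify=y preserves the 80/20 class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print((y_train == 1).mean(), (y_test == 1).mean())  # both 0.20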
@@ -438,7 +442,7 @@
 "\n",
 "for name, model in models.items():\n",
 " print(f\"Training {name}...\")\n",
-"
+"\n",
 " # Use scaled data for Logistic Regression, unscaled for tree-based\n",
 " if \"Logistic\" in name:\n",
 " model.fit(X_train_scaled, y_train)\n",
@@ -448,20 +452,20 @@
 " model.fit(X_train, y_train)\n",
 " y_pred = model.predict(X_test)\n",
 " y_pred_proba = model.predict_proba(X_test)[:, 1]\n",
-"
+"\n",
 " # Calculate metrics\n",
 " auc = roc_auc_score(y_test, y_pred_proba)\n",
 " pr_auc = average_precision_score(y_test, y_pred_proba)\n",
 " f1 = f1_score(y_test, y_pred)\n",
 " precision = precision_score(y_test, y_pred)\n",
 " recall = recall_score(y_test, y_pred)\n",
-"
+"\n",
 " # Cross-validation\n",
 " if \"Logistic\" in name:\n",
 " cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='roc_auc')\n",
 " else:\n",
 " cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')\n",
-"
+"\n",
 " results.append({\n",
 " \"Model\": name,\n",
 " \"Test AUC\": auc,\n",
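One observation on this hunk: cross_val_score is given X_train_scaled, which was scaled once before the CV loop, so every fold's validation rows already influenced the scaler's mean and variance. Wrapping scaler and model in a Pipeline refits the scaler inside each fold instead. A self-contained sketch of that variant (synthetic data; not the package's code):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X_train, y_train = make_classification(n_samples=500, random_state=0)

# The Pipeline fits StandardScaler on each fold's training portion only,
# so fold-validation rows never leak into the scaling statistics.
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring="roc_auc")
print(f"CV AUC: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")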
@@ -472,7 +476,7 @@
 " \"CV AUC Mean\": cv_scores.mean(),\n",
 " \"CV AUC Std\": cv_scores.std()\n",
 " })\n",
-"
+"\n",
 " model_predictions[name] = {\n",
 " 'y_pred': y_pred,\n",
 " 'y_pred_proba': y_pred_proba,\n",
@@ -799,12 +803,12 @@
 "print(f\" PR-AUC: {best_model['PR-AUC']:.4f}\")\n",
 "print(f\" F1-Score: {best_model['F1-Score']:.4f}\")\n",
 "\n",
-"print(
+"print(\"\\n📊 TOP 3 IMPORTANT FEATURES:\")\n",
 "for i, feat in enumerate(importance_df.head(3)['Feature'].tolist(), 1):\n",
 " imp = importance_df[importance_df['Feature'] == feat]['Importance'].values[0]\n",
 " print(f\" {i}. {feat} ({imp:.3f})\")\n",
 "\n",
-"print(
+"print(\"\\n📈 MODEL PERFORMANCE ASSESSMENT:\")\n",
 "if best_model['Test AUC'] > 0.90:\n",
 " print(\" Excellent predictive signal - likely production-ready with tuning\")\n",
 "elif best_model['Test AUC'] > 0.80:\n",
@@ -814,7 +818,7 @@
 "else:\n",
 " print(\" Weak signal - may need more data or different features\")\n",
 "\n",
-"print(
+"print(\"\\n💡 NEXT STEPS:\")\n",
 "print(\" 1. Feature engineering with derived features (notebook 05)\")\n",
 "print(\" 2. Hyperparameter tuning (GridSearchCV)\")\n",
 "print(\" 3. Threshold optimization for business metrics\")\n",
{churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb

@@ -66,13 +66,15 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"09_business_alignment.ipynb\")\n",
 "\n",
-"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
-"from customer_retention.analysis.visualization import display_table\n",
 "import pandas as pd\n",
-"
-"from customer_retention.
+"\n",
+"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
+"from customer_retention.core.config.experiments import (\n",
+" FINDINGS_DIR,\n",
+")\n"
 ]
 },
 {
@@ -98,7 +100,6 @@
 "outputs": [],
 "source": [
 "# === CONFIGURATION ===\n",
-"from pathlib import Path\n",
 "\n",
 "# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
 "\n",