churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
|
@@ -0,0 +1,780 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"id": "aed6419c",
|
|
6
|
+
"metadata": {
|
|
7
|
+
"papermill": {
|
|
8
|
+
"duration": 0.002447,
|
|
9
|
+
"end_time": "2026-02-02T13:03:35.667377",
|
|
10
|
+
"exception": false,
|
|
11
|
+
"start_time": "2026-02-02T13:03:35.664930",
|
|
12
|
+
"status": "completed"
|
|
13
|
+
},
|
|
14
|
+
"tags": []
|
|
15
|
+
},
|
|
16
|
+
"source": [
|
|
17
|
+
"# Chapter 7: Modeling Readiness\n",
|
|
18
|
+
"\n",
|
|
19
|
+
"**Purpose:** Validate that data is ready for machine learning and identify potential pitfalls.\n",
|
|
20
|
+
"\n",
|
|
21
|
+
"**What you'll learn:**\n",
|
|
22
|
+
"- How to assess if your data is ready for modeling\n",
|
|
23
|
+
"- How to detect potential data leakage before it ruins your model\n",
|
|
24
|
+
"- How to handle class imbalance in binary classification\n",
|
|
25
|
+
"- What preprocessing steps are needed before training\n",
|
|
26
|
+
"\n",
|
|
27
|
+
"**Outputs:**\n",
|
|
28
|
+
"- Pre-modeling validation checklist with pass/fail status\n",
|
|
29
|
+
"- Detailed leakage risk assessment (including temporal leakage)\n",
|
|
30
|
+
"- Class imbalance analysis with strategy recommendations\n",
|
|
31
|
+
"- Overall readiness score\n",
|
|
32
|
+
"\n",
|
|
33
|
+
"---\n",
|
|
34
|
+
"\n",
|
|
35
|
+
"## Why Modeling Readiness Matters\n",
|
|
36
|
+
"\n",
|
|
37
|
+
"| Issue | Impact | Detection |\n",
|
|
38
|
+
"|-------|--------|-----------|\n",
|
|
39
|
+
"| **Data Leakage** | Overly optimistic results that fail in production | High correlation with target, future information |\n",
|
|
40
|
+
"| **Class Imbalance** | Model ignores minority class | Majority-to-minority class ratio > 3:1 |\n",
|
|
41
|
+
"| **Missing Values** | Model failures or bias | >50% missing in any column |\n",
|
|
42
|
+
"| **Insufficient Data** | Overfitting, poor generalization | <100 rows for simple models |"
|
|
43
|
+
]
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"cell_type": "markdown",
|
|
47
|
+
"id": "449360f6",
|
|
48
|
+
"metadata": {
|
|
49
|
+
"papermill": {
|
|
50
|
+
"duration": 0.002167,
|
|
51
|
+
"end_time": "2026-02-02T13:03:35.671795",
|
|
52
|
+
"exception": false,
|
|
53
|
+
"start_time": "2026-02-02T13:03:35.669628",
|
|
54
|
+
"status": "completed"
|
|
55
|
+
},
|
|
56
|
+
"tags": []
|
|
57
|
+
},
|
|
58
|
+
"source": [
|
|
59
|
+
"## 7.1 Setup"
|
|
60
|
+
]
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
"cell_type": "code",
|
|
64
|
+
"execution_count": null,
|
|
65
|
+
"id": "f691e37e",
|
|
66
|
+
"metadata": {
|
|
67
|
+
"execution": {
|
|
68
|
+
"iopub.execute_input": "2026-02-02T13:03:35.676385Z",
|
|
69
|
+
"iopub.status.busy": "2026-02-02T13:03:35.676255Z",
|
|
70
|
+
"iopub.status.idle": "2026-02-02T13:03:37.921967Z",
|
|
71
|
+
"shell.execute_reply": "2026-02-02T13:03:37.921217Z"
|
|
72
|
+
},
|
|
73
|
+
"papermill": {
|
|
74
|
+
"duration": 2.248892,
|
|
75
|
+
"end_time": "2026-02-02T13:03:37.922664",
|
|
76
|
+
"exception": false,
|
|
77
|
+
"start_time": "2026-02-02T13:03:35.673772",
|
|
78
|
+
"status": "completed"
|
|
79
|
+
},
|
|
80
|
+
"tags": []
|
|
81
|
+
},
|
|
82
|
+
"outputs": [],
|
|
83
|
+
"source": [
|
|
84
|
+
"from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
|
|
85
|
+
"track_and_export_previous(\"07_modeling_readiness.ipynb\")\n",
|
|
86
|
+
"\n",
|
|
87
|
+
"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
|
|
88
|
+
"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
|
|
89
|
+
"from customer_retention.core.config.column_config import ColumnType\n",
|
|
90
|
+
"from customer_retention.stages.modeling import ImbalanceRecommender, ImbalanceHandler, ImbalanceStrategy\n",
|
|
91
|
+
"import pandas as pd\n",
|
|
92
|
+
"import numpy as np\n",
|
|
93
|
+
"import plotly.graph_objects as go\n",
|
|
94
|
+
"import plotly.express as px\n",
|
|
95
|
+
"from plotly.subplots import make_subplots\n",
|
|
96
|
+
"from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure"
|
|
97
|
+
]
|
|
98
|
+
},
|
|
99
|
+
{
|
|
100
|
+
"cell_type": "code",
|
|
101
|
+
"execution_count": null,
|
|
102
|
+
"id": "5a3b408e",
|
|
103
|
+
"metadata": {
|
|
104
|
+
"execution": {
|
|
105
|
+
"iopub.execute_input": "2026-02-02T13:03:37.927421Z",
|
|
106
|
+
"iopub.status.busy": "2026-02-02T13:03:37.927148Z",
|
|
107
|
+
"iopub.status.idle": "2026-02-02T13:03:38.207116Z",
|
|
108
|
+
"shell.execute_reply": "2026-02-02T13:03:38.206590Z"
|
|
109
|
+
},
|
|
110
|
+
"papermill": {
|
|
111
|
+
"duration": 0.283322,
|
|
112
|
+
"end_time": "2026-02-02T13:03:38.207772",
|
|
113
|
+
"exception": false,
|
|
114
|
+
"start_time": "2026-02-02T13:03:37.924450",
|
|
115
|
+
"status": "completed"
|
|
116
|
+
},
|
|
117
|
+
"tags": []
|
|
118
|
+
},
|
|
119
|
+
"outputs": [],
|
|
120
|
+
"source": [
|
|
121
|
+
"# === CONFIGURATION ===\n",
|
|
122
|
+
"from pathlib import Path\n",
|
|
123
|
+
"\n",
|
|
124
|
+
"# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
|
|
125
|
+
"\n",
|
|
126
|
+
"findings_files = [f for f in FINDINGS_DIR.glob(\"*_findings.yaml\") if \"multi_dataset\" not in f.name]\n",
|
|
127
|
+
"if not findings_files:\n",
|
|
128
|
+
" raise FileNotFoundError(f\"No findings files found in {FINDINGS_DIR}. Run notebook 01 first.\")\n",
|
|
129
|
+
"\n",
|
|
130
|
+
"# Prefer aggregated findings (from 01d) over event-level findings\n",
|
|
131
|
+
"# Pattern: *_aggregated* in filename indicates aggregated data\n",
|
|
132
|
+
"aggregated_files = [f for f in findings_files if \"_aggregated\" in f.name]\n",
|
|
133
|
+
"non_aggregated_files = [f for f in findings_files if \"_aggregated\" not in f.name]\n",
|
|
134
|
+
"\n",
|
|
135
|
+
"if aggregated_files:\n",
|
|
136
|
+
" # Use most recent aggregated file\n",
|
|
137
|
+
" aggregated_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)\n",
|
|
138
|
+
" FINDINGS_PATH = str(aggregated_files[0])\n",
|
|
139
|
+
" print(f\"Found {len(aggregated_files)} aggregated findings file(s)\")\n",
|
|
140
|
+
" print(f\"Using: {FINDINGS_PATH}\")\n",
|
|
141
|
+
" if non_aggregated_files:\n",
|
|
142
|
+
" print(f\" (Skipping {len(non_aggregated_files)} event-level findings)\")\n",
|
|
143
|
+
"else:\n",
|
|
144
|
+
" # Fall back to most recent non-aggregated file\n",
|
|
145
|
+
" non_aggregated_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)\n",
|
|
146
|
+
" FINDINGS_PATH = str(non_aggregated_files[0])\n",
|
|
147
|
+
" print(f\"Found {len(findings_files)} findings file(s)\")\n",
|
|
148
|
+
" print(f\"Using: {FINDINGS_PATH}\")\n",
|
|
149
|
+
"\n",
|
|
150
|
+
"findings = ExplorationFindings.load(FINDINGS_PATH)\n",
|
|
151
|
+
"\n",
|
|
152
|
+
"# Load data - handle aggregated vs standard paths\n",
|
|
153
|
+
"from customer_retention.stages.temporal import load_data_with_snapshot_preference, TEMPORAL_METADATA_COLS\n",
|
|
154
|
+
"\n",
|
|
155
|
+
"# For aggregated data, load directly from the parquet source\n",
|
|
156
|
+
"if \"_aggregated\" in FINDINGS_PATH and findings.source_path.endswith('.parquet'):\n",
|
|
157
|
+
" source_path = Path(findings.source_path)\n",
|
|
158
|
+
" # Handle relative path from notebook directory\n",
|
|
159
|
+
" if not source_path.is_absolute():\n",
|
|
160
|
+
" # The source_path in findings is relative to project root\n",
|
|
161
|
+
" if str(source_path).startswith(\"experiments\"):\n",
|
|
162
|
+
" source_path = Path(\"..\") / source_path\n",
|
|
163
|
+
" else:\n",
|
|
164
|
+
" source_path = FINDINGS_DIR / source_path.name\n",
|
|
165
|
+
" df = pd.read_parquet(source_path)\n",
|
|
166
|
+
" data_source = f\"aggregated:{source_path.name}\"\n",
|
|
167
|
+
"else:\n",
|
|
168
|
+
" # Standard loading for event-level or entity-level data\n",
|
|
169
|
+
" df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
|
|
170
|
+
"\n",
|
|
171
|
+
"charts = ChartBuilder()\n",
|
|
172
|
+
"\n",
|
|
173
|
+
"print(f\"\\nLoaded {len(df):,} rows from: {data_source}\")"
|
|
174
|
+
]
|
|
175
|
+
},
|
|
176
|
+
{
|
|
177
|
+
"cell_type": "markdown",
|
|
178
|
+
"id": "b06f1920",
|
|
179
|
+
"metadata": {
|
|
180
|
+
"papermill": {
|
|
181
|
+
"duration": 0.002205,
|
|
182
|
+
"end_time": "2026-02-02T13:03:38.211956",
|
|
183
|
+
"exception": false,
|
|
184
|
+
"start_time": "2026-02-02T13:03:38.209751",
|
|
185
|
+
"status": "completed"
|
|
186
|
+
},
|
|
187
|
+
"tags": []
|
|
188
|
+
},
|
|
189
|
+
"source": [
|
|
190
|
+
"## 7.2 Modeling Readiness Checklist"
|
|
191
|
+
]
|
|
192
|
+
},
|
|
193
|
+
{
|
|
194
|
+
"cell_type": "code",
|
|
195
|
+
"execution_count": null,
|
|
196
|
+
"id": "5e2c1373",
|
|
197
|
+
"metadata": {
|
|
198
|
+
"execution": {
|
|
199
|
+
"iopub.execute_input": "2026-02-02T13:03:38.216137Z",
|
|
200
|
+
"iopub.status.busy": "2026-02-02T13:03:38.216008Z",
|
|
201
|
+
"iopub.status.idle": "2026-02-02T13:03:38.223092Z",
|
|
202
|
+
"shell.execute_reply": "2026-02-02T13:03:38.222694Z"
|
|
203
|
+
},
|
|
204
|
+
"papermill": {
|
|
205
|
+
"duration": 0.00999,
|
|
206
|
+
"end_time": "2026-02-02T13:03:38.223767",
|
|
207
|
+
"exception": false,
|
|
208
|
+
"start_time": "2026-02-02T13:03:38.213777",
|
|
209
|
+
"status": "completed"
|
|
210
|
+
},
|
|
211
|
+
"tags": []
|
|
212
|
+
},
|
|
213
|
+
"outputs": [],
|
|
214
|
+
"source": [
|
|
215
|
+
"checklist = []\n",
|
|
216
|
+
"\n",
|
|
217
|
+
"has_target = findings.target_column is not None\n",
|
|
218
|
+
"checklist.append({\"Check\": \"Target column identified\", \"Status\": \"Pass\" if has_target else \"Fail\"})\n",
|
|
219
|
+
"\n",
|
|
220
|
+
"has_features = len([c for c in findings.columns.values() \n",
|
|
221
|
+
" if c.inferred_type not in [ColumnType.IDENTIFIER, ColumnType.TARGET]]) > 0\n",
|
|
222
|
+
"checklist.append({\"Check\": \"Feature columns available\", \"Status\": \"Pass\" if has_features else \"Fail\"})\n",
|
|
223
|
+
"\n",
|
|
224
|
+
"high_missing = any(c.universal_metrics.get(\"null_percentage\", 0) > 50 \n",
|
|
225
|
+
" for c in findings.columns.values())\n",
|
|
226
|
+
"checklist.append({\"Check\": \"No columns with >50% missing\", \"Status\": \"Fail\" if high_missing else \"Pass\"})\n",
|
|
227
|
+
"\n",
|
|
228
|
+
"good_quality = findings.overall_quality_score >= 70\n",
|
|
229
|
+
"checklist.append({\"Check\": \"Quality score >= 70\", \"Status\": \"Pass\" if good_quality else \"Warn\"})\n",
|
|
230
|
+
"\n",
|
|
231
|
+
"sufficient_rows = findings.row_count >= 100\n",
|
|
232
|
+
"checklist.append({\"Check\": \"Sufficient sample size (>=100)\", \"Status\": \"Pass\" if sufficient_rows else \"Fail\"})\n",
|
|
233
|
+
"\n",
|
|
234
|
+
"print(\"Modeling Readiness Checklist:\")\n",
|
|
235
|
+
"print(\"=\"*50)\n",
|
|
236
|
+
"checklist_df = pd.DataFrame(checklist)\n",
|
|
237
|
+
"display(checklist_df)"
|
|
238
|
+
]
|
|
239
|
+
},
|
|
240
|
+
{
|
|
241
|
+
"cell_type": "markdown",
|
|
242
|
+
"id": "2e7b9faa",
|
|
243
|
+
"metadata": {
|
|
244
|
+
"papermill": {
|
|
245
|
+
"duration": 0.001522,
|
|
246
|
+
"end_time": "2026-02-02T13:03:38.227106",
|
|
247
|
+
"exception": false,
|
|
248
|
+
"start_time": "2026-02-02T13:03:38.225584",
|
|
249
|
+
"status": "completed"
|
|
250
|
+
},
|
|
251
|
+
"tags": []
|
|
252
|
+
},
|
|
253
|
+
"source": [
|
|
254
|
+
"## 7.3 Class Imbalance Analysis\n",
|
|
255
|
+
"\n",
|
|
256
|
+
"**📖 Understanding Class Imbalance:**\n",
|
|
257
|
+
"- **Ratio < 3:1** - Mild imbalance, standard methods work\n",
|
|
258
|
+
"- **Ratio 3:1 to 10:1** - Moderate imbalance, use stratified sampling + class weights\n",
|
|
259
|
+
"- **Ratio > 10:1** - Severe imbalance, consider SMOTE, undersampling, or focal loss\n",
|
|
260
|
+
"\n",
|
|
261
|
+
"**⚠️ Why It Matters:**\n",
|
|
262
|
+
"- Imbalanced data causes models to predict the majority class\n",
|
|
263
|
+
"- Accuracy is misleading: a model that always predicts the majority class scores 80% accuracy when 80% of rows belong to that class, yet it is useless\n",
|
|
264
|
+
"- Use F1-score, Precision, Recall, and AUC instead of accuracy"
|
|
265
|
+
]
|
|
266
|
+
},
|
|
267
|
+
{
|
|
268
|
+
"cell_type": "code",
|
|
269
|
+
"execution_count": null,
|
|
270
|
+
"id": "9b9d7edc",
|
|
271
|
+
"metadata": {
|
|
272
|
+
"execution": {
|
|
273
|
+
"iopub.execute_input": "2026-02-02T13:03:38.231909Z",
|
|
274
|
+
"iopub.status.busy": "2026-02-02T13:03:38.231790Z",
|
|
275
|
+
"iopub.status.idle": "2026-02-02T13:03:38.248222Z",
|
|
276
|
+
"shell.execute_reply": "2026-02-02T13:03:38.247617Z"
|
|
277
|
+
},
|
|
278
|
+
"papermill": {
|
|
279
|
+
"duration": 0.020094,
|
|
280
|
+
"end_time": "2026-02-02T13:03:38.248940",
|
|
281
|
+
"exception": false,
|
|
282
|
+
"start_time": "2026-02-02T13:03:38.228846",
|
|
283
|
+
"status": "completed"
|
|
284
|
+
},
|
|
285
|
+
"tags": []
|
|
286
|
+
},
|
|
287
|
+
"outputs": [],
|
|
288
|
+
"source": [
|
|
289
|
+
"if findings.target_column:\n",
|
|
290
|
+
" target = findings.target_column\n",
|
|
291
|
+
" target_series = df[target]\n",
|
|
292
|
+
" \n",
|
|
293
|
+
" print(\"=\" * 70)\n",
|
|
294
|
+
" print(\"CLASS IMBALANCE ANALYSIS\")\n",
|
|
295
|
+
" print(\"=\" * 70)\n",
|
|
296
|
+
" \n",
|
|
297
|
+
" print(f\"\\nTarget Column: {target}\")\n",
|
|
298
|
+
" print(f\"Target Type: {findings.target_type}\")\n",
|
|
299
|
+
" print(f\"Missing Values: {target_series.isnull().sum()}\")\n",
|
|
300
|
+
" \n",
|
|
301
|
+
" if findings.target_type == \"binary\":\n",
|
|
302
|
+
" value_counts = target_series.value_counts()\n",
|
|
303
|
+
" majority_class = value_counts.idxmax()\n",
|
|
304
|
+
" minority_class = value_counts.idxmin()\n",
|
|
305
|
+
" majority_count = value_counts.max()\n",
|
|
306
|
+
" minority_count = value_counts.min()\n",
|
|
307
|
+
" \n",
|
|
308
|
+
" print(f\"\\n📊 CLASS DISTRIBUTION:\")\n",
|
|
309
|
+
" print(f\" Majority Class ({majority_class}): {majority_count:,} ({majority_count/len(df)*100:.1f}%)\")\n",
|
|
310
|
+
" print(f\" Minority Class ({minority_class}): {minority_count:,} ({minority_count/len(df)*100:.1f}%)\")\n",
|
|
311
|
+
" \n",
|
|
312
|
+
" # Use framework recommender for strategy recommendations\n",
|
|
313
|
+
" recommender = ImbalanceRecommender()\n",
|
|
314
|
+
" rec = recommender.recommend(target_series, n_samples=len(df))\n",
|
|
315
|
+
" rec.print_recommendation()\n",
|
|
316
|
+
" \n",
|
|
317
|
+
" # Visualize\n",
|
|
318
|
+
" severity_colors = {\"low\": \"#2ca02c\", \"moderate\": \"#ffbb00\", \"high\": \"#ff7f0e\", \"severe\": \"#d62728\"}\n",
|
|
319
|
+
" fig = go.Figure(go.Bar(\n",
|
|
320
|
+
" x=['Churned (0)', 'Retained (1)'],\n",
|
|
321
|
+
" y=[value_counts.get(0, 0), value_counts.get(1, 0)],\n",
|
|
322
|
+
" marker_color=['#d62728', '#2ca02c'],\n",
|
|
323
|
+
" text=[f'{value_counts.get(0, 0):,}<br>({value_counts.get(0, 0)/len(df)*100:.1f}%)',\n",
|
|
324
|
+
" f'{value_counts.get(1, 0):,}<br>({value_counts.get(1, 0)/len(df)*100:.1f}%)'],\n",
|
|
325
|
+
" textposition='outside'\n",
|
|
326
|
+
" ))\n",
|
|
327
|
+
" fig.update_layout(\n",
|
|
328
|
+
" title=f'Target Class Distribution ({rec.severity.upper()} imbalance: {rec.ratio:.1f}:1)',\n",
|
|
329
|
+
" xaxis_title='Class', yaxis_title='Count',\n",
|
|
330
|
+
" template='plotly_white', height=400\n",
|
|
331
|
+
" )\n",
|
|
332
|
+
" display_figure(fig)\n",
|
|
333
|
+
" \n",
|
|
334
|
+
" # Show sklearn class weights\n",
|
|
335
|
+
" print(f\"\\n💡 SKLEARN CLASS WEIGHTS:\")\n",
|
|
336
|
+
" weight_minority = len(df) / (2 * minority_count)\n",
|
|
337
|
+
" weight_majority = len(df) / (2 * majority_count)\n",
|
|
338
|
+
" print(f\" class_weight={{0: {weight_majority:.3f}, 1: {weight_minority:.3f}}}\")\n",
|
|
339
|
+
" print(f\" Or use class_weight='balanced'\")\n",
|
|
340
|
+
" \n",
|
|
341
|
+
" # Store recommendation for later use\n",
|
|
342
|
+
" imbalance_recommendation = rec\n",
|
|
343
|
+
"else:\n",
|
|
344
|
+
" print(\"ERROR: No target column identified. Please set one in findings.\")"
|
|
345
|
+
]
|
|
346
|
+
},
|
|
347
|
+
{
|
|
348
|
+
"cell_type": "markdown",
|
|
349
|
+
"id": "9451b433",
|
|
350
|
+
"metadata": {
|
|
351
|
+
"papermill": {
|
|
352
|
+
"duration": 0.003069,
|
|
353
|
+
"end_time": "2026-02-02T13:03:38.255316",
|
|
354
|
+
"exception": false,
|
|
355
|
+
"start_time": "2026-02-02T13:03:38.252247",
|
|
356
|
+
"status": "completed"
|
|
357
|
+
},
|
|
358
|
+
"tags": []
|
|
359
|
+
},
|
|
360
|
+
"source": [
|
|
361
|
+
"## 7.4 Data Leakage Risk Assessment\n",
|
|
362
|
+
"\n",
|
|
363
|
+
"**📖 Types of Leakage:**\n",
|
|
364
|
+
"- **Target Leakage**: Feature contains information about the target that wouldn't be available at prediction time\n",
|
|
365
|
+
"- **Train-Test Leakage**: Information from test set leaks into training (e.g., scaling before split)\n",
|
|
366
|
+
"- **Temporal Leakage**: Using future information to predict past events\n",
|
|
367
|
+
"\n",
|
|
368
|
+
"**⚠️ Warning Signs:**\n",
|
|
369
|
+
"- Correlation > 0.9 with target (suspiciously predictive)\n",
|
|
370
|
+
"- Column names containing 'future', 'outcome', 'result', or 'after'\n",
|
|
371
|
+
"- Date columns that come after the target determination date"
|
|
372
|
+
]
|
|
373
|
+
},
|
|
374
|
+
{
|
|
375
|
+
"cell_type": "code",
|
|
376
|
+
"execution_count": null,
|
|
377
|
+
"id": "3574ad19",
|
|
378
|
+
"metadata": {
|
|
379
|
+
"execution": {
|
|
380
|
+
"iopub.execute_input": "2026-02-02T13:03:38.262810Z",
|
|
381
|
+
"iopub.status.busy": "2026-02-02T13:03:38.262698Z",
|
|
382
|
+
"iopub.status.idle": "2026-02-02T13:03:38.279246Z",
|
|
383
|
+
"shell.execute_reply": "2026-02-02T13:03:38.278657Z"
|
|
384
|
+
},
|
|
385
|
+
"papermill": {
|
|
386
|
+
"duration": 0.021268,
|
|
387
|
+
"end_time": "2026-02-02T13:03:38.280069",
|
|
388
|
+
"exception": false,
|
|
389
|
+
"start_time": "2026-02-02T13:03:38.258801",
|
|
390
|
+
"status": "completed"
|
|
391
|
+
},
|
|
392
|
+
"tags": []
|
|
393
|
+
},
|
|
394
|
+
"outputs": [],
|
|
395
|
+
"source": [
|
|
396
|
+
"leakage_risks = []\n",
|
|
397
|
+
"\n",
|
|
398
|
+
"if findings.target_column:\n",
|
|
399
|
+
" target = findings.target_column\n",
|
|
400
|
+
" \n",
|
|
401
|
+
" for col_name, col_info in findings.columns.items():\n",
|
|
402
|
+
" if col_name == target or col_info.inferred_type == ColumnType.IDENTIFIER or col_name in TEMPORAL_METADATA_COLS:\n",
|
|
403
|
+
" continue\n",
|
|
404
|
+
" \n",
|
|
405
|
+
" if col_info.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]:\n",
|
|
406
|
+
" corr = df[[col_name, target]].corr().iloc[0, 1]\n",
|
|
407
|
+
" if abs(corr) > 0.9:\n",
|
|
408
|
+
" leakage_risks.append({\n",
|
|
409
|
+
" \"Column\": col_name,\n",
|
|
410
|
+
" \"Risk\": \"High\",\n",
|
|
411
|
+
" \"Reason\": f\"Very high correlation ({corr:.3f}) - potential leakage\"\n",
|
|
412
|
+
" })\n",
|
|
413
|
+
" \n",
|
|
414
|
+
" if any(kw in col_name.lower() for kw in ['future', 'outcome', 'result', 'after']):\n",
|
|
415
|
+
" leakage_risks.append({\n",
|
|
416
|
+
" \"Column\": col_name,\n",
|
|
417
|
+
" \"Risk\": \"Medium\",\n",
|
|
418
|
+
" \"Reason\": \"Name suggests post-prediction information\"\n",
|
|
419
|
+
" })\n",
|
|
420
|
+
"\n",
|
|
421
|
+
"if leakage_risks:\n",
|
|
422
|
+
" print(\"Potential Leakage Risks:\")\n",
|
|
423
|
+
" display(pd.DataFrame(leakage_risks))\n",
|
|
424
|
+
"else:\n",
|
|
425
|
+
" print(\"No obvious leakage risks detected.\")"
|
|
426
|
+
]
|
|
427
|
+
},
|
|
428
|
+
{
|
|
429
|
+
"cell_type": "markdown",
|
|
430
|
+
"id": "4f2a31f3",
|
|
431
|
+
"metadata": {
|
|
432
|
+
"papermill": {
|
|
433
|
+
"duration": 0.003132,
|
|
434
|
+
"end_time": "2026-02-02T13:03:38.287171",
|
|
435
|
+
"exception": false,
|
|
436
|
+
"start_time": "2026-02-02T13:03:38.284039",
|
|
437
|
+
"status": "completed"
|
|
438
|
+
},
|
|
439
|
+
"tags": []
|
|
440
|
+
},
|
|
441
|
+
"source": [
|
|
442
|
+
"## 7.5 Feature Type Summary"
|
|
443
|
+
]
|
|
444
|
+
},
|
|
445
|
+
{
|
|
446
|
+
"cell_type": "code",
|
|
447
|
+
"execution_count": null,
|
|
448
|
+
"id": "defee56e",
|
|
449
|
+
"metadata": {
|
|
450
|
+
"execution": {
|
|
451
|
+
"iopub.execute_input": "2026-02-02T13:03:38.294240Z",
|
|
452
|
+
"iopub.status.busy": "2026-02-02T13:03:38.294128Z",
|
|
453
|
+
"iopub.status.idle": "2026-02-02T13:03:38.296979Z",
|
|
454
|
+
"shell.execute_reply": "2026-02-02T13:03:38.296492Z"
|
|
455
|
+
},
|
|
456
|
+
"papermill": {
|
|
457
|
+
"duration": 0.007292,
|
|
458
|
+
"end_time": "2026-02-02T13:03:38.297588",
|
|
459
|
+
"exception": false,
|
|
460
|
+
"start_time": "2026-02-02T13:03:38.290296",
|
|
461
|
+
"status": "completed"
|
|
462
|
+
},
|
|
463
|
+
"tags": []
|
|
464
|
+
},
|
|
465
|
+
"outputs": [],
|
|
466
|
+
"source": [
|
|
467
|
+
"type_summary = {}\n",
|
|
468
|
+
"for col_info in findings.columns.values():\n",
|
|
469
|
+
" col_type = col_info.inferred_type.value\n",
|
|
470
|
+
" type_summary[col_type] = type_summary.get(col_type, 0) + 1\n",
|
|
471
|
+
"\n",
|
|
472
|
+
"print(\"Feature Type Distribution:\")\n",
|
|
473
|
+
"for col_type, count in sorted(type_summary.items()):\n",
|
|
474
|
+
" print(f\" {col_type}: {count}\")\n",
|
|
475
|
+
"\n",
|
|
476
|
+
"usable_features = sum(1 for c in findings.columns.values() \n",
|
|
477
|
+
" if c.inferred_type not in [ColumnType.IDENTIFIER, ColumnType.TARGET])\n",
|
|
478
|
+
"print(f\"\\nUsable features for modeling: {usable_features}\")"
|
|
479
|
+
]
|
|
480
|
+
},
|
|
481
|
+
{
|
|
482
|
+
"cell_type": "markdown",
|
|
483
|
+
"id": "e611b0f7",
|
|
484
|
+
"metadata": {
|
|
485
|
+
"papermill": {
|
|
486
|
+
"duration": 0.04253,
|
|
487
|
+
"end_time": "2026-02-02T13:03:38.343412",
|
|
488
|
+
"exception": false,
|
|
489
|
+
"start_time": "2026-02-02T13:03:38.300882",
|
|
490
|
+
"status": "completed"
|
|
491
|
+
},
|
|
492
|
+
"tags": []
|
|
493
|
+
},
|
|
494
|
+
"source": [
|
|
495
|
+
"## 7.6 Readiness Score"
|
|
496
|
+
]
|
|
497
|
+
},
|
|
498
|
+
{
|
|
499
|
+
"cell_type": "code",
|
|
500
|
+
"execution_count": null,
|
|
501
|
+
"id": "a1ea6872",
|
|
502
|
+
"metadata": {
|
|
503
|
+
"execution": {
|
|
504
|
+
"iopub.execute_input": "2026-02-02T13:03:38.351535Z",
|
|
505
|
+
"iopub.status.busy": "2026-02-02T13:03:38.351409Z",
|
|
506
|
+
"iopub.status.idle": "2026-02-02T13:03:38.354295Z",
|
|
507
|
+
"shell.execute_reply": "2026-02-02T13:03:38.353563Z"
|
|
508
|
+
},
|
|
509
|
+
"papermill": {
|
|
510
|
+
"duration": 0.007388,
|
|
511
|
+
"end_time": "2026-02-02T13:03:38.354961",
|
|
512
|
+
"exception": false,
|
|
513
|
+
"start_time": "2026-02-02T13:03:38.347573",
|
|
514
|
+
"status": "completed"
|
|
515
|
+
},
|
|
516
|
+
"tags": []
|
|
517
|
+
},
|
|
518
|
+
"outputs": [],
|
|
519
|
+
"source": [
|
|
520
|
+
"scores = []\n",
|
|
521
|
+
"\n",
|
|
522
|
+
"scores.append(25 if has_target else 0)\n",
|
|
523
|
+
"scores.append(25 if has_features else 0)\n",
|
|
524
|
+
"scores.append(25 if not high_missing else 10)\n",
|
|
525
|
+
"scores.append(25 if good_quality else 15)\n",
|
|
526
|
+
"\n",
|
|
527
|
+
"readiness_score = sum(scores)\n",
|
|
528
|
+
"\n",
|
|
529
|
+
"print(f\"\\nModeling Readiness Score: {readiness_score}/100\")\n",
|
|
530
|
+
"\n",
|
|
531
|
+
"if readiness_score >= 90:\n",
|
|
532
|
+
" print(\"Status: READY - Proceed to modeling.\")\n",
|
|
533
|
+
"elif readiness_score >= 70:\n",
|
|
534
|
+
" print(\"Status: MOSTLY READY - Address minor issues before modeling.\")\n",
|
|
535
|
+
"elif readiness_score >= 50:\n",
|
|
536
|
+
" print(\"Status: NEEDS WORK - Significant issues to resolve.\")\n",
|
|
537
|
+
"else:\n",
|
|
538
|
+
" print(\"Status: NOT READY - Major issues must be fixed first.\")"
|
|
539
|
+
]
|
|
540
|
+
},
|
|
541
|
+
{
|
|
542
|
+
"cell_type": "markdown",
|
|
543
|
+
"id": "0e24b56c",
|
|
544
|
+
"metadata": {
|
|
545
|
+
"papermill": {
|
|
546
|
+
"duration": 0.002984,
|
|
547
|
+
"end_time": "2026-02-02T13:03:38.361310",
|
|
548
|
+
"exception": false,
|
|
549
|
+
"start_time": "2026-02-02T13:03:38.358326",
|
|
550
|
+
"status": "completed"
|
|
551
|
+
},
|
|
552
|
+
"tags": []
|
|
553
|
+
},
|
|
554
|
+
"source": [
|
|
555
|
+
"## 7.7 Feature Availability Status\n",
|
|
556
|
+
"\n",
|
|
557
|
+
"Features with tracking changes, i.e. incomplete temporal coverage (identified in notebook 06), will be excluded from modeling in notebook 08."
|
|
558
|
+
]
|
|
559
|
+
},
|
|
560
|
+
{
|
|
561
|
+
"cell_type": "code",
|
|
562
|
+
"execution_count": null,
|
|
563
|
+
"id": "d70e8643",
|
|
564
|
+
"metadata": {
|
|
565
|
+
"execution": {
|
|
566
|
+
"iopub.execute_input": "2026-02-02T13:03:38.368589Z",
|
|
567
|
+
"iopub.status.busy": "2026-02-02T13:03:38.368474Z",
|
|
568
|
+
"iopub.status.idle": "2026-02-02T13:03:38.371508Z",
|
|
569
|
+
"shell.execute_reply": "2026-02-02T13:03:38.371048Z"
|
|
570
|
+
},
|
|
571
|
+
"papermill": {
|
|
572
|
+
"duration": 0.007445,
|
|
573
|
+
"end_time": "2026-02-02T13:03:38.372190",
|
|
574
|
+
"exception": false,
|
|
575
|
+
"start_time": "2026-02-02T13:03:38.364745",
|
|
576
|
+
"status": "completed"
|
|
577
|
+
},
|
|
578
|
+
"tags": []
|
|
579
|
+
},
|
|
580
|
+
"outputs": [],
|
|
581
|
+
"source": [
|
|
582
|
+
"# Feature Availability Status Report\n",
|
|
583
|
+
"print(\"=\" * 70)\n",
|
|
584
|
+
"print(\"FEATURE AVAILABILITY STATUS\")\n",
|
|
585
|
+
"print(\"=\" * 70)\n",
|
|
586
|
+
"\n",
|
|
587
|
+
"unavailable_features = findings.metadata.get(\"unavailable_features\", [])\n",
|
|
588
|
+
"if findings.has_availability_issues:\n",
|
|
589
|
+
" print(f\"\\n⚠️ {len(findings.problematic_availability_columns)} features with tracking changes:\")\n",
|
|
590
|
+
" for col in findings.problematic_availability_columns[:10]:\n",
|
|
591
|
+
" info = findings.get_feature_availability(col)\n",
|
|
592
|
+
" if info:\n",
|
|
593
|
+
" print(f\" • {col} ({info.availability_type}, {info.coverage_pct:.0f}% coverage)\")\n",
|
|
594
|
+
" if len(findings.problematic_availability_columns) > 10:\n",
|
|
595
|
+
" print(f\" ... and {len(findings.problematic_availability_columns) - 10} more\")\n",
|
|
596
|
+
" \n",
|
|
597
|
+
" action = findings.metadata.get(\"availability_action\", \"exclude\")\n",
|
|
598
|
+
" print(f\"\\n📋 Action: {action.upper()}\")\n",
|
|
599
|
+
" print(f\" These features will be excluded in notebook 08.\")\n",
|
|
600
|
+
"else:\n",
|
|
601
|
+
" print(\"\\n✅ All features have full temporal coverage.\")"
|
|
602
|
+
]
|
|
603
|
+
},
|
|
604
|
+
{
|
|
605
|
+
"cell_type": "markdown",
|
|
606
|
+
"id": "0b483740",
|
|
607
|
+
"metadata": {
|
|
608
|
+
"papermill": {
|
|
609
|
+
"duration": 0.003681,
|
|
610
|
+
"end_time": "2026-02-02T13:03:38.379463",
|
|
611
|
+
"exception": false,
|
|
612
|
+
"start_time": "2026-02-02T13:03:38.375782",
|
|
613
|
+
"status": "completed"
|
|
614
|
+
},
|
|
615
|
+
"tags": []
|
|
616
|
+
},
|
|
617
|
+
"source": [
|
|
618
|
+
"---\n",
|
|
619
|
+
"\n",
|
|
620
|
+
"## Summary: What We Learned\n",
|
|
621
|
+
"\n",
|
|
622
|
+
"In this notebook, we validated modeling readiness:\n",
|
|
623
|
+
"\n",
|
|
624
|
+
"1. **Pre-modeling Checklist** - Verified target, features, missing values, quality score, and sample size\n",
|
|
625
|
+
"2. **Class Imbalance** - Analyzed distribution and provided mitigation strategies\n",
|
|
626
|
+
"3. **Leakage Assessment** - Checked for suspiciously high target correlations and column names that suggest post-prediction information\n",
|
|
627
|
+
"4. **Feature Summary** - Reviewed usable features by type\n",
|
|
628
|
+
"\n",
|
|
629
|
+
"## Key Actions Before Modeling\n",
|
|
630
|
+
"\n",
|
|
631
|
+
"| Action | Priority | Implementation |\n",
|
|
632
|
+
"|--------|----------|----------------|\n",
|
|
633
|
+
"| Use stratified splits | High | `train_test_split(..., stratify=y)` |\n",
|
|
634
|
+
"| Handle imbalance | High | `class_weight='balanced'` or SMOTE |\n",
|
|
635
|
+
"| Scale features | Medium | `StandardScaler` (fit on train only!) |\n",
|
|
636
|
+
"| Encode categoricals | Medium | One-hot or target encoding |\n",
|
|
637
|
+
"\n",
|
|
638
|
+
"---\n",
|
|
639
|
+
"\n",
|
|
640
|
+
"## Next Steps\n",
|
|
641
|
+
"\n",
|
|
642
|
+
"Continue to **08_baseline_experiments.ipynb** to:\n",
|
|
643
|
+
"- Train baseline models with proper handling\n",
|
|
644
|
+
"- Compare model performance\n",
|
|
645
|
+
"- Analyze feature importance\n",
|
|
646
|
+
"- Evaluate with appropriate metrics (not just accuracy!)"
|
|
647
|
+
]
|
|
648
|
+
},
|
|
649
|
+
{
|
|
650
|
+
"cell_type": "markdown",
|
|
651
|
+
"id": "172b0b13",
|
|
652
|
+
"metadata": {
|
|
653
|
+
"papermill": {
|
|
654
|
+
"duration": 0.003265,
|
|
655
|
+
"end_time": "2026-02-02T13:03:38.385925",
|
|
656
|
+
"exception": false,
|
|
657
|
+
"start_time": "2026-02-02T13:03:38.382660",
|
|
658
|
+
"status": "completed"
|
|
659
|
+
},
|
|
660
|
+
"tags": []
|
|
661
|
+
},
|
|
662
|
+
"source": [
|
|
663
|
+
"## 7.8 Final Leakage Validation\n",
|
|
664
|
+
"\n",
|
|
665
|
+
"**CRITICAL:** Run comprehensive leakage checks before model training.\n",
|
|
666
|
+
"\n",
|
|
667
|
+
"This validates:\n",
|
|
668
|
+
"- No target column in features (LD052)\n",
|
|
669
|
+
"- No target-derived columns (LD052)\n",
|
|
670
|
+
"- No domain-pattern columns with high correlation (LD053)\n",
|
|
671
|
+
"- No suspiciously perfect correlations (LD001)\n",
|
|
672
|
+
"- Temporal train/test split if applicable (LD061)"
|
|
673
|
+
]
|
|
674
|
+
},
|
|
675
|
+
{
|
|
676
|
+
"cell_type": "code",
|
|
677
|
+
"execution_count": null,
|
|
678
|
+
"id": "60fb86d5",
|
|
679
|
+
"metadata": {
|
|
680
|
+
"execution": {
|
|
681
|
+
"iopub.execute_input": "2026-02-02T13:03:38.393401Z",
|
|
682
|
+
"iopub.status.busy": "2026-02-02T13:03:38.393233Z",
|
|
683
|
+
"iopub.status.idle": "2026-02-02T13:03:38.402284Z",
|
|
684
|
+
"shell.execute_reply": "2026-02-02T13:03:38.401776Z"
|
|
685
|
+
},
|
|
686
|
+
"papermill": {
|
|
687
|
+
"duration": 0.013548,
|
|
688
|
+
"end_time": "2026-02-02T13:03:38.402821",
|
|
689
|
+
"exception": false,
|
|
690
|
+
"start_time": "2026-02-02T13:03:38.389273",
|
|
691
|
+
"status": "completed"
|
|
692
|
+
},
|
|
693
|
+
"tags": []
|
|
694
|
+
},
|
|
695
|
+
"outputs": [],
|
|
696
|
+
"source": [
|
|
697
|
+
"# Final leakage validation before model training\n",
|
|
698
|
+
"from customer_retention.analysis.diagnostics import LeakageDetector\n",
|
|
699
|
+
"\n",
|
|
700
|
+
"if 'X' in dir() and 'y' in dir():\n",
|
|
701
|
+
" detector = LeakageDetector()\n",
|
|
702
|
+
" result = detector.run_all_checks(X, y, include_pit=False)\n",
|
|
703
|
+
" \n",
|
|
704
|
+
" print(\"=\" * 70)\n",
|
|
705
|
+
" print(\"FINAL LEAKAGE VALIDATION\")\n",
|
|
706
|
+
" print(\"=\" * 70)\n",
|
|
707
|
+
" \n",
|
|
708
|
+
" if result.passed:\n",
|
|
709
|
+
" print(\"\\n✅ PASSED: No critical leakage issues\")\n",
|
|
710
|
+
" print(f\" Checks run: {len(result.checks)}\")\n",
|
|
711
|
+
" \n",
|
|
712
|
+
" # Show warnings for HIGH severity issues\n",
|
|
713
|
+
" high_issues = [c for c in result.checks if c.severity.value == 'high']\n",
|
|
714
|
+
" if high_issues:\n",
|
|
715
|
+
" print(f\"\\n⚠️ {len(high_issues)} HIGH severity warnings (review recommended):\")\n",
|
|
716
|
+
" for issue in high_issues[:3]:\n",
|
|
717
|
+
" print(f\" • {issue.feature}: {issue.recommendation[:80]}...\")\n",
|
|
718
|
+
" else:\n",
|
|
719
|
+
" print(\"\\n❌ CRITICAL LEAKAGE DETECTED - DO NOT TRAIN MODEL\")\n",
|
|
720
|
+
" for issue in result.critical_issues:\n",
|
|
721
|
+
" print(f\"\\n [{issue.check_id}] {issue.feature}\")\n",
|
|
722
|
+
" print(f\" {issue.recommendation}\")\n",
|
|
723
|
+
" raise ValueError(f\"Cannot proceed: {len(result.critical_issues)} critical leakage issues\")\n",
|
|
724
|
+
"else:\n",
|
|
725
|
+
" print(\"X and y not yet defined - run feature preparation first\")"
|
|
726
|
+
]
|
|
727
|
+
},
|
|
728
|
+
{
|
|
729
|
+
"cell_type": "markdown",
|
|
730
|
+
"id": "5cb6819c",
|
|
731
|
+
"metadata": {
|
|
732
|
+
"papermill": {
|
|
733
|
+
"duration": 0.002913,
|
|
734
|
+
"end_time": "2026-02-02T13:03:38.409033",
|
|
735
|
+
"exception": false,
|
|
736
|
+
"start_time": "2026-02-02T13:03:38.406120",
|
|
737
|
+
"status": "completed"
|
|
738
|
+
},
|
|
739
|
+
"tags": []
|
|
740
|
+
},
|
|
741
|
+
"source": [
|
|
742
|
+
"> **Save Reminder:** Save this notebook (Ctrl+S / Cmd+S) before running the next one.\n",
|
|
743
|
+
"> The next notebook will automatically export this notebook's HTML documentation from the saved file."
|
|
744
|
+
]
|
|
745
|
+
}
|
|
746
|
+
],
|
|
747
|
+
"metadata": {
|
|
748
|
+
"kernelspec": {
|
|
749
|
+
"display_name": "Python 3",
|
|
750
|
+
"language": "python",
|
|
751
|
+
"name": "python3"
|
|
752
|
+
},
|
|
753
|
+
"language_info": {
|
|
754
|
+
"codemirror_mode": {
|
|
755
|
+
"name": "ipython",
|
|
756
|
+
"version": 3
|
|
757
|
+
},
|
|
758
|
+
"file_extension": ".py",
|
|
759
|
+
"mimetype": "text/x-python",
|
|
760
|
+
"name": "python",
|
|
761
|
+
"nbconvert_exporter": "python",
|
|
762
|
+
"pygments_lexer": "ipython3",
|
|
763
|
+
"version": "3.12.4"
|
|
764
|
+
},
|
|
765
|
+
"papermill": {
|
|
766
|
+
"default_parameters": {},
|
|
767
|
+
"duration": 5.906739,
|
|
768
|
+
"end_time": "2026-02-02T13:03:41.029412",
|
|
769
|
+
"environment_variables": {},
|
|
770
|
+
"exception": null,
|
|
771
|
+
"input_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/07_modeling_readiness.ipynb",
|
|
772
|
+
"output_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/07_modeling_readiness.ipynb",
|
|
773
|
+
"parameters": {},
|
|
774
|
+
"start_time": "2026-02-02T13:03:35.122673",
|
|
775
|
+
"version": "2.6.0"
|
|
776
|
+
}
|
|
777
|
+
},
|
|
778
|
+
"nbformat": 4,
|
|
779
|
+
"nbformat_minor": 5
|
|
780
|
+
}
|