churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb
ADDED
@@ -0,0 +1,1890 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"id": "1356bd5c",
|
|
6
|
+
"metadata": {
|
|
7
|
+
"papermill": {
|
|
8
|
+
"duration": 0.003424,
|
|
9
|
+
"end_time": "2026-02-02T13:03:14.230161",
|
|
10
|
+
"exception": false,
|
|
11
|
+
"start_time": "2026-02-02T13:03:14.226737",
|
|
12
|
+
"status": "completed"
|
|
13
|
+
},
|
|
14
|
+
"tags": []
|
|
15
|
+
},
|
|
16
|
+
"source": [
|
|
17
|
+
"# Chapter 4: Relationship Analysis\n",
|
|
18
|
+
"\n",
|
|
19
|
+
"**Purpose:** Explore feature correlations, relationships with the target, and identify predictive signals.\n",
|
|
20
|
+
"\n",
|
|
21
|
+
"**What you'll learn:**\n",
|
|
22
|
+
"- How to interpret correlation matrices and identify multicollinearity\n",
|
|
23
|
+
"- How to visualize feature distributions by target class\n",
|
|
24
|
+
"- How to identify which features have the strongest relationship with retention\n",
|
|
25
|
+
"- How to analyze categorical features for predictive power\n",
|
|
26
|
+
"\n",
|
|
27
|
+
"**Outputs:**\n",
|
|
28
|
+
"- Correlation heatmap with multicollinearity detection\n",
|
|
29
|
+
"- Feature distributions by retention status (box plots)\n",
|
|
30
|
+
"- Retention rates by categorical features\n",
|
|
31
|
+
"- Feature-target correlation rankings\n",
|
|
32
|
+
"\n",
|
|
33
|
+
"---\n",
|
|
34
|
+
"\n",
|
|
35
|
+
"## Understanding Feature Relationships\n",
|
|
36
|
+
"\n",
|
|
37
|
+
"| Analysis | What It Tells You | Action |\n",
|
|
38
|
+
"|----------|------------------|--------|\n",
|
|
39
|
+
"| **High Correlation** (r > 0.7) | Features carry redundant information | Consider removing one |\n",
|
|
40
|
+
"| **Target Correlation** | Feature's predictive power | Prioritize high-correlation features |\n",
|
|
41
|
+
"| **Class Separation** | How different retained vs churned look | Good separation = good predictor |\n",
|
|
42
|
+
"| **Categorical Rates** | Retention varies by category | Use for segmentation and encoding |"
|
|
43
|
+
]
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"cell_type": "markdown",
|
|
47
|
+
"id": "49fb8d17",
|
|
48
|
+
"metadata": {
|
|
49
|
+
"papermill": {
|
|
50
|
+
"duration": 0.002426,
|
|
51
|
+
"end_time": "2026-02-02T13:03:14.235498",
|
|
52
|
+
"exception": false,
|
|
53
|
+
"start_time": "2026-02-02T13:03:14.233072",
|
|
54
|
+
"status": "completed"
|
|
55
|
+
},
|
|
56
|
+
"tags": []
|
|
57
|
+
},
|
|
58
|
+
"source": [
|
|
59
|
+
"## 4.1 Setup"
|
|
60
|
+
]
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
"cell_type": "code",
|
|
64
|
+
"execution_count": null,
|
|
65
|
+
"id": "dfe28e7e",
|
|
66
|
+
"metadata": {
|
|
67
|
+
"execution": {
|
|
68
|
+
"iopub.execute_input": "2026-02-02T13:03:14.242119Z",
|
|
69
|
+
"iopub.status.busy": "2026-02-02T13:03:14.241998Z",
|
|
70
|
+
"iopub.status.idle": "2026-02-02T13:03:16.097606Z",
|
|
71
|
+
"shell.execute_reply": "2026-02-02T13:03:16.096428Z"
|
|
72
|
+
},
|
|
73
|
+
"papermill": {
|
|
74
|
+
"duration": 1.860233,
|
|
75
|
+
"end_time": "2026-02-02T13:03:16.098514",
|
|
76
|
+
"exception": false,
|
|
77
|
+
"start_time": "2026-02-02T13:03:14.238281",
|
|
78
|
+
"status": "completed"
|
|
79
|
+
},
|
|
80
|
+
"tags": []
|
|
81
|
+
},
|
|
82
|
+
"outputs": [],
|
|
83
|
+
"source": [
|
|
84
|
+
"from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
|
|
85
|
+
"track_and_export_previous(\"04_relationship_analysis.ipynb\")\n",
|
|
86
|
+
"\n",
|
|
87
|
+
"from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationRegistry\n",
|
|
88
|
+
"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
|
|
89
|
+
"from customer_retention.core.config.column_config import ColumnType\n",
|
|
90
|
+
"from customer_retention.stages.profiling import (\n",
|
|
91
|
+
" RelationshipRecommender, RecommendationCategory\n",
|
|
92
|
+
")\n",
|
|
93
|
+
"import yaml\n",
|
|
94
|
+
"import pandas as pd\n",
|
|
95
|
+
"import numpy as np\n",
|
|
96
|
+
"import plotly.graph_objects as go\n",
|
|
97
|
+
"import plotly.express as px\n",
|
|
98
|
+
"from plotly.subplots import make_subplots\n",
|
|
99
|
+
"from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure\n"
|
|
100
|
+
]
|
|
101
|
+
},
|
|
102
|
+
{
|
|
103
|
+
"cell_type": "code",
|
|
104
|
+
"execution_count": null,
|
|
105
|
+
"id": "c7bcf647",
|
|
106
|
+
"metadata": {
|
|
107
|
+
"execution": {
|
|
108
|
+
"iopub.execute_input": "2026-02-02T13:03:16.105397Z",
|
|
109
|
+
"iopub.status.busy": "2026-02-02T13:03:16.105257Z",
|
|
110
|
+
"iopub.status.idle": "2026-02-02T13:03:16.457751Z",
|
|
111
|
+
"shell.execute_reply": "2026-02-02T13:03:16.457003Z"
|
|
112
|
+
},
|
|
113
|
+
"papermill": {
|
|
114
|
+
"duration": 0.356824,
|
|
115
|
+
"end_time": "2026-02-02T13:03:16.458495",
|
|
116
|
+
"exception": false,
|
|
117
|
+
"start_time": "2026-02-02T13:03:16.101671",
|
|
118
|
+
"status": "completed"
|
|
119
|
+
},
|
|
120
|
+
"tags": []
|
|
121
|
+
},
|
|
122
|
+
"outputs": [],
|
|
123
|
+
"source": [
|
|
124
|
+
"# === CONFIGURATION ===\n",
|
|
125
|
+
"# Option 1: Set the exact path from notebook 01 output\n",
|
|
126
|
+
"# FINDINGS_PATH = \"../experiments/findings/customer_retention_retail_abc123_findings.yaml\"\n",
|
|
127
|
+
"\n",
|
|
128
|
+
"# Option 2: Auto-discover the most recent findings file\n",
|
|
129
|
+
"from pathlib import Path\n",
|
|
130
|
+
"\n",
|
|
131
|
+
"# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
|
|
132
|
+
"\n",
|
|
133
|
+
"findings_files = [f for f in FINDINGS_DIR.glob(\"*_findings.yaml\") if \"multi_dataset\" not in f.name]\n",
|
|
134
|
+
"if not findings_files:\n",
|
|
135
|
+
" raise FileNotFoundError(f\"No findings files found in {FINDINGS_DIR}. Run notebook 01 first.\")\n",
|
|
136
|
+
"\n",
|
|
137
|
+
"# Prefer aggregated findings (from 01d) over event-level findings\n",
|
|
138
|
+
"# Pattern: *_aggregated* in filename indicates aggregated data\n",
|
|
139
|
+
"aggregated_files = [f for f in findings_files if \"_aggregated\" in f.name]\n",
|
|
140
|
+
"non_aggregated_files = [f for f in findings_files if \"_aggregated\" not in f.name]\n",
|
|
141
|
+
"\n",
|
|
142
|
+
"if aggregated_files:\n",
|
|
143
|
+
" # Use most recent aggregated file\n",
|
|
144
|
+
" aggregated_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)\n",
|
|
145
|
+
" FINDINGS_PATH = str(aggregated_files[0])\n",
|
|
146
|
+
" print(f\"Found {len(aggregated_files)} aggregated findings file(s)\")\n",
|
|
147
|
+
" print(f\"Using: {FINDINGS_PATH}\")\n",
|
|
148
|
+
" if non_aggregated_files:\n",
|
|
149
|
+
" print(f\" (Skipping {len(non_aggregated_files)} event-level findings)\")\n",
|
|
150
|
+
"else:\n",
|
|
151
|
+
" # Fall back to most recent non-aggregated file\n",
|
|
152
|
+
" non_aggregated_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)\n",
|
|
153
|
+
" FINDINGS_PATH = str(non_aggregated_files[0])\n",
|
|
154
|
+
" print(f\"Found {len(findings_files)} findings file(s)\")\n",
|
|
155
|
+
" print(f\"Using: {FINDINGS_PATH}\")\n",
|
|
156
|
+
"\n",
|
|
157
|
+
"RECOMMENDATIONS_PATH = FINDINGS_PATH.replace(\"_findings.yaml\", \"_recommendations.yaml\")\n",
|
|
158
|
+
"\n",
|
|
159
|
+
"findings = ExplorationFindings.load(FINDINGS_PATH)\n",
|
|
160
|
+
"\n",
|
|
161
|
+
"# Load data - handle aggregated vs standard paths\n",
|
|
162
|
+
"from customer_retention.stages.temporal import load_data_with_snapshot_preference, TEMPORAL_METADATA_COLS\n",
|
|
163
|
+
"\n",
|
|
164
|
+
"# For aggregated data, load directly from the parquet source\n",
|
|
165
|
+
"if \"_aggregated\" in FINDINGS_PATH and findings.source_path.endswith('.parquet'):\n",
|
|
166
|
+
" source_path = Path(findings.source_path)\n",
|
|
167
|
+
" # Handle relative path from notebook directory\n",
|
|
168
|
+
" if not source_path.is_absolute():\n",
|
|
169
|
+
" # The source_path in findings is relative to project root\n",
|
|
170
|
+
" if str(source_path).startswith(\"experiments\"):\n",
|
|
171
|
+
" source_path = Path(\"..\") / source_path\n",
|
|
172
|
+
" else:\n",
|
|
173
|
+
" source_path = FINDINGS_DIR / source_path.name\n",
|
|
174
|
+
" df = pd.read_parquet(source_path)\n",
|
|
175
|
+
" data_source = f\"aggregated:{source_path.name}\"\n",
|
|
176
|
+
"else:\n",
|
|
177
|
+
" # Standard loading for event-level or entity-level data\n",
|
|
178
|
+
" df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
|
|
179
|
+
"\n",
|
|
180
|
+
"charts = ChartBuilder()\n",
|
|
181
|
+
"\n",
|
|
182
|
+
"if Path(RECOMMENDATIONS_PATH).exists():\n",
|
|
183
|
+
" with open(RECOMMENDATIONS_PATH, \"r\") as f:\n",
|
|
184
|
+
" registry = RecommendationRegistry.from_dict(yaml.safe_load(f))\n",
|
|
185
|
+
" print(f\"Loaded existing recommendations: {len(registry.all_recommendations)} total\")\n",
|
|
186
|
+
"else:\n",
|
|
187
|
+
" registry = RecommendationRegistry()\n",
|
|
188
|
+
" registry.init_bronze(findings.source_path)\n",
|
|
189
|
+
" registry.init_silver(findings.entity_column or \"entity_id\")\n",
|
|
190
|
+
" registry.init_gold(findings.target_column or \"target\")\n",
|
|
191
|
+
" print(\"Initialized new recommendation registry\")\n",
|
|
192
|
+
"\n",
|
|
193
|
+
"print(f\"\\nLoaded {len(df):,} rows from: {data_source}\")"
|
|
194
|
+
]
|
|
195
|
+
},
|
|
196
|
+
{
|
|
197
|
+
"cell_type": "markdown",
|
|
198
|
+
"id": "f177a41d",
|
|
199
|
+
"metadata": {
|
|
200
|
+
"papermill": {
|
|
201
|
+
"duration": 0.012025,
|
|
202
|
+
"end_time": "2026-02-02T13:03:16.475847",
|
|
203
|
+
"exception": false,
|
|
204
|
+
"start_time": "2026-02-02T13:03:16.463822",
|
|
205
|
+
"status": "completed"
|
|
206
|
+
},
|
|
207
|
+
"tags": []
|
|
208
|
+
},
|
|
209
|
+
"source": [
|
|
210
|
+
"## 4.2 Numeric Correlation Matrix\n",
|
|
211
|
+
"\n",
|
|
212
|
+
"**📖 How to Read the Heatmap:**\n",
|
|
213
|
+
"- **Red (+1)**: Perfect positive correlation - features move together\n",
|
|
214
|
+
"- **Blue (-1)**: Perfect negative correlation - features move opposite\n",
|
|
215
|
+
"- **White (0)**: No linear relationship\n",
|
|
216
|
+
"\n",
|
|
217
|
+
"**⚠️ Multicollinearity Warning:**\n",
|
|
218
|
+
"- Pairs with |r| > 0.7 may cause issues in linear models\n",
|
|
219
|
+
"- Consider removing one feature from highly correlated pairs\n",
|
|
220
|
+
"- Tree-based models are more robust to multicollinearity"
|
|
221
|
+
]
|
|
222
|
+
},
|
|
223
|
+
{
|
|
224
|
+
"cell_type": "code",
|
|
225
|
+
"execution_count": null,
|
|
226
|
+
"id": "138a85ed",
|
|
227
|
+
"metadata": {
|
|
228
|
+
"execution": {
|
|
229
|
+
"iopub.execute_input": "2026-02-02T13:03:16.487009Z",
|
|
230
|
+
"iopub.status.busy": "2026-02-02T13:03:16.486891Z",
|
|
231
|
+
"iopub.status.idle": "2026-02-02T13:03:16.531590Z",
|
|
232
|
+
"shell.execute_reply": "2026-02-02T13:03:16.531151Z"
|
|
233
|
+
},
|
|
234
|
+
"papermill": {
|
|
235
|
+
"duration": 0.050299,
|
|
236
|
+
"end_time": "2026-02-02T13:03:16.532166",
|
|
237
|
+
"exception": false,
|
|
238
|
+
"start_time": "2026-02-02T13:03:16.481867",
|
|
239
|
+
"status": "completed"
|
|
240
|
+
},
|
|
241
|
+
"tags": []
|
|
242
|
+
},
|
|
243
|
+
"outputs": [],
|
|
244
|
+
"source": [
|
|
245
|
+
"numeric_cols = [\n",
|
|
246
|
+
" name for name, col in findings.columns.items()\n",
|
|
247
|
+
" if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE, ColumnType.TARGET]\n",
|
|
248
|
+
" and name not in TEMPORAL_METADATA_COLS\n",
|
|
249
|
+
"]\n",
|
|
250
|
+
"\n",
|
|
251
|
+
"if len(numeric_cols) >= 2:\n",
|
|
252
|
+
" corr_matrix = df[numeric_cols].corr()\n",
|
|
253
|
+
" fig = charts.heatmap(\n",
|
|
254
|
+
" corr_matrix.values,\n",
|
|
255
|
+
" x_labels=numeric_cols,\n",
|
|
256
|
+
" y_labels=numeric_cols,\n",
|
|
257
|
+
" title=\"Numeric Correlation Matrix\"\n",
|
|
258
|
+
" )\n",
|
|
259
|
+
" display_figure(fig)\n",
|
|
260
|
+
"else:\n",
|
|
261
|
+
" print(\"Not enough numeric columns for correlation analysis.\")"
|
|
262
|
+
]
|
|
263
|
+
},
|
|
264
|
+
{
|
|
265
|
+
"cell_type": "markdown",
|
|
266
|
+
"id": "67f09eb5",
|
|
267
|
+
"metadata": {
|
|
268
|
+
"papermill": {
|
|
269
|
+
"duration": 0.004406,
|
|
270
|
+
"end_time": "2026-02-02T13:03:16.541516",
|
|
271
|
+
"exception": false,
|
|
272
|
+
"start_time": "2026-02-02T13:03:16.537110",
|
|
273
|
+
"status": "completed"
|
|
274
|
+
},
|
|
275
|
+
"tags": []
|
|
276
|
+
},
|
|
277
|
+
"source": [
|
|
278
|
+
"## 4.3 High Correlation Pairs"
|
|
279
|
+
]
|
|
280
|
+
},
|
|
281
|
+
{
|
|
282
|
+
"cell_type": "code",
|
|
283
|
+
"execution_count": null,
|
|
284
|
+
"id": "eec4069c",
|
|
285
|
+
"metadata": {
|
|
286
|
+
"execution": {
|
|
287
|
+
"iopub.execute_input": "2026-02-02T13:03:16.551293Z",
|
|
288
|
+
"iopub.status.busy": "2026-02-02T13:03:16.551148Z",
|
|
289
|
+
"iopub.status.idle": "2026-02-02T13:03:16.600981Z",
|
|
290
|
+
"shell.execute_reply": "2026-02-02T13:03:16.600205Z"
|
|
291
|
+
},
|
|
292
|
+
"papermill": {
|
|
293
|
+
"duration": 0.056391,
|
|
294
|
+
"end_time": "2026-02-02T13:03:16.602593",
|
|
295
|
+
"exception": false,
|
|
296
|
+
"start_time": "2026-02-02T13:03:16.546202",
|
|
297
|
+
"status": "completed"
|
|
298
|
+
},
|
|
299
|
+
"tags": []
|
|
300
|
+
},
|
|
301
|
+
"outputs": [],
|
|
302
|
+
"source": [
|
|
303
|
+
"high_corr_threshold = 0.7\n",
|
|
304
|
+
"high_corr_pairs = []\n",
|
|
305
|
+
"\n",
|
|
306
|
+
"if len(numeric_cols) >= 2:\n",
|
|
307
|
+
" corr_matrix = df[numeric_cols].corr()\n",
|
|
308
|
+
" for i in range(len(numeric_cols)):\n",
|
|
309
|
+
" for j in range(i+1, len(numeric_cols)):\n",
|
|
310
|
+
" corr_val = corr_matrix.iloc[i, j]\n",
|
|
311
|
+
" if abs(corr_val) >= high_corr_threshold:\n",
|
|
312
|
+
" high_corr_pairs.append({\n",
|
|
313
|
+
" \"Column 1\": numeric_cols[i],\n",
|
|
314
|
+
" \"Column 2\": numeric_cols[j],\n",
|
|
315
|
+
" \"Correlation\": f\"{corr_val:.3f}\"\n",
|
|
316
|
+
" })\n",
|
|
317
|
+
"\n",
|
|
318
|
+
"if high_corr_pairs:\n",
|
|
319
|
+
" print(f\"High Correlation Pairs (|r| >= {high_corr_threshold}):\")\n",
|
|
320
|
+
" display(pd.DataFrame(high_corr_pairs))\n",
|
|
321
|
+
" print(\"\\nConsider removing one of each pair to reduce multicollinearity.\")\n",
|
|
322
|
+
"else:\n",
|
|
323
|
+
" print(\"No high correlation pairs detected.\")"
|
|
324
|
+
]
|
|
325
|
+
},
|
|
326
|
+
{
|
|
327
|
+
"cell_type": "markdown",
|
|
328
|
+
"id": "3c8860b5",
|
|
329
|
+
"metadata": {
|
|
330
|
+
"papermill": {
|
|
331
|
+
"duration": 0.005993,
|
|
332
|
+
"end_time": "2026-02-02T13:03:16.613757",
|
|
333
|
+
"exception": false,
|
|
334
|
+
"start_time": "2026-02-02T13:03:16.607764",
|
|
335
|
+
"status": "completed"
|
|
336
|
+
},
|
|
337
|
+
"tags": []
|
|
338
|
+
},
|
|
339
|
+
"source": [
|
|
340
|
+
"## 4.4 Feature Distributions by Retention Status\n",
|
|
341
|
+
"\n",
|
|
342
|
+
"**📖 How to Interpret Box Plots:**\n",
|
|
343
|
+
"- **Box** = Middle 50% of data (IQR)\n",
|
|
344
|
+
"- **Line inside box** = Median\n",
|
|
345
|
+
"- **Whiskers** = 1.5 × IQR from box edges\n",
|
|
346
|
+
"- **Points outside** = Outliers\n",
|
|
347
|
+
"\n",
|
|
348
|
+
"**⚠️ What Makes a Good Predictor:**\n",
|
|
349
|
+
"- **Clear separation** between retained (green) and churned (red) boxes\n",
|
|
350
|
+
"- **Different medians** = Feature values differ between classes\n",
|
|
351
|
+
"- **Minimal overlap** = Easier to distinguish classes"
|
|
352
|
+
]
|
|
353
|
+
},
|
|
354
|
+
{
|
|
355
|
+
"cell_type": "code",
|
|
356
|
+
"execution_count": null,
|
|
357
|
+
"id": "d65804ff",
|
|
358
|
+
"metadata": {
|
|
359
|
+
"execution": {
|
|
360
|
+
"iopub.execute_input": "2026-02-02T13:03:16.625086Z",
|
|
361
|
+
"iopub.status.busy": "2026-02-02T13:03:16.624968Z",
|
|
362
|
+
"iopub.status.idle": "2026-02-02T13:03:16.741429Z",
|
|
363
|
+
"shell.execute_reply": "2026-02-02T13:03:16.741097Z"
|
|
364
|
+
},
|
|
365
|
+
"papermill": {
|
|
366
|
+
"duration": 0.12323,
|
|
367
|
+
"end_time": "2026-02-02T13:03:16.742085",
|
|
368
|
+
"exception": false,
|
|
369
|
+
"start_time": "2026-02-02T13:03:16.618855",
|
|
370
|
+
"status": "completed"
|
|
371
|
+
},
|
|
372
|
+
"tags": []
|
|
373
|
+
},
|
|
374
|
+
"outputs": [],
|
|
375
|
+
"source": [
|
|
376
|
+
"# Feature Distributions by Retention Status\n",
|
|
377
|
+
"if findings.target_column and findings.target_column in df.columns:\n",
|
|
378
|
+
" target = findings.target_column\n",
|
|
379
|
+
" \n",
|
|
380
|
+
" feature_cols = [\n",
|
|
381
|
+
" name for name, col in findings.columns.items()\n",
|
|
382
|
+
" if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
|
|
383
|
+
" and name != target\n",
|
|
384
|
+
" and name not in TEMPORAL_METADATA_COLS\n",
|
|
385
|
+
" ]\n",
|
|
386
|
+
" \n",
|
|
387
|
+
" if feature_cols:\n",
|
|
388
|
+
" print(\"=\" * 80)\n",
|
|
389
|
+
" print(f\"FEATURE DISTRIBUTIONS BY TARGET: {target}\")\n",
|
|
390
|
+
" print(\"=\" * 80)\n",
|
|
391
|
+
" \n",
|
|
392
|
+
" # Calculate summary statistics by target\n",
|
|
393
|
+
" summary_by_target = []\n",
|
|
394
|
+
" for col in feature_cols:\n",
|
|
395
|
+
" for target_val, label in [(0, \"Churned\"), (1, \"Retained\")]:\n",
|
|
396
|
+
" subset = df[df[target] == target_val][col].dropna()\n",
|
|
397
|
+
" if len(subset) > 0:\n",
|
|
398
|
+
" summary_by_target.append({\n",
|
|
399
|
+
" \"Feature\": col,\n",
|
|
400
|
+
" \"Group\": label,\n",
|
|
401
|
+
" \"Count\": len(subset),\n",
|
|
402
|
+
" \"Mean\": subset.mean(),\n",
|
|
403
|
+
" \"Median\": subset.median(),\n",
|
|
404
|
+
" \"Std\": subset.std()\n",
|
|
405
|
+
" })\n",
|
|
406
|
+
" \n",
|
|
407
|
+
" if summary_by_target:\n",
|
|
408
|
+
" summary_df = pd.DataFrame(summary_by_target)\n",
|
|
409
|
+
" \n",
|
|
410
|
+
" # Display summary table\n",
|
|
411
|
+
" print(\"\\n📊 Summary Statistics by Retention Status:\")\n",
|
|
412
|
+
" display_summary = summary_df.pivot(index=\"Feature\", columns=\"Group\", values=[\"Mean\", \"Median\"])\n",
|
|
413
|
+
" display_summary.columns = [f\"{stat} ({group})\" for stat, group in display_summary.columns]\n",
|
|
414
|
+
" display(display_summary.round(3))\n",
|
|
415
|
+
" \n",
|
|
416
|
+
" # Calculate effect size (Cohen's d) for each feature\n",
|
|
417
|
+
" print(\"\\n📈 Feature Importance Indicators (Effect Size - Cohen's d):\")\n",
|
|
418
|
+
" print(\"-\" * 70)\n",
|
|
419
|
+
" effect_sizes = []\n",
|
|
420
|
+
" for col in feature_cols:\n",
|
|
421
|
+
" churned = df[df[target] == 0][col].dropna()\n",
|
|
422
|
+
" retained = df[df[target] == 1][col].dropna()\n",
|
|
423
|
+
" \n",
|
|
424
|
+
" if len(churned) > 0 and len(retained) > 0:\n",
|
|
425
|
+
" # Cohen's d\n",
|
|
426
|
+
" pooled_std = np.sqrt(((len(churned)-1)*churned.std()**2 + (len(retained)-1)*retained.std()**2) / \n",
|
|
427
|
+
" (len(churned) + len(retained) - 2))\n",
|
|
428
|
+
" if pooled_std > 0:\n",
|
|
429
|
+
" d = (retained.mean() - churned.mean()) / pooled_std\n",
|
|
430
|
+
" else:\n",
|
|
431
|
+
" d = 0\n",
|
|
432
|
+
" \n",
|
|
433
|
+
" # Interpret effect size\n",
|
|
434
|
+
" abs_d = abs(d)\n",
|
|
435
|
+
" if abs_d >= 0.8:\n",
|
|
436
|
+
" interpretation = \"Large effect\"\n",
|
|
437
|
+
" emoji = \"🔴\"\n",
|
|
438
|
+
" elif abs_d >= 0.5:\n",
|
|
439
|
+
" interpretation = \"Medium effect\"\n",
|
|
440
|
+
" emoji = \"🟡\"\n",
|
|
441
|
+
" elif abs_d >= 0.2:\n",
|
|
442
|
+
" interpretation = \"Small effect\"\n",
|
|
443
|
+
" emoji = \"🟢\"\n",
|
|
444
|
+
" else:\n",
|
|
445
|
+
" interpretation = \"Negligible\"\n",
|
|
446
|
+
" emoji = \"⚪\"\n",
|
|
447
|
+
" \n",
|
|
448
|
+
" effect_sizes.append({\n",
|
|
449
|
+
" \"feature\": col,\n",
|
|
450
|
+
" \"cohens_d\": d,\n",
|
|
451
|
+
" \"abs_d\": abs_d,\n",
|
|
452
|
+
" \"interpretation\": interpretation\n",
|
|
453
|
+
" })\n",
|
|
454
|
+
" \n",
|
|
455
|
+
" direction = \"↑ Higher in retained\" if d > 0 else \"↓ Lower in retained\"\n",
|
|
456
|
+
" print(f\" {emoji} {col}: d={d:+.3f} ({interpretation}) {direction}\")\n",
|
|
457
|
+
" \n",
|
|
458
|
+
" # Sort by effect size for identifying important features\n",
|
|
459
|
+
" if effect_sizes:\n",
|
|
460
|
+
" effect_df = pd.DataFrame(effect_sizes).sort_values(\"abs_d\", ascending=False)\n",
|
|
461
|
+
" important_features = effect_df[effect_df[\"abs_d\"] >= 0.2][\"feature\"].tolist()\n",
|
|
462
|
+
" if important_features:\n",
|
|
463
|
+
" print(f\"\\n⭐ Features with notable effect (|d| ≥ 0.2): {', '.join(important_features)}\")\n",
|
|
464
|
+
" else:\n",
|
|
465
|
+
" print(\" No effect sizes could be calculated (insufficient data in one or both groups)\")\n",
|
|
466
|
+
" else:\n",
|
|
467
|
+
" print(\"No numeric feature columns found for distribution analysis.\")\n",
|
|
468
|
+
"else:\n",
|
|
469
|
+
" print(\"Target column not available.\")"
|
|
470
|
+
]
|
|
471
|
+
},
|
|
472
|
+
{
|
|
473
|
+
"cell_type": "markdown",
|
|
474
|
+
"id": "d6430c2f",
|
|
475
|
+
"metadata": {
|
|
476
|
+
"papermill": {
|
|
477
|
+
"duration": 0.004505,
|
|
478
|
+
"end_time": "2026-02-02T13:03:16.751786",
|
|
479
|
+
"exception": false,
|
|
480
|
+
"start_time": "2026-02-02T13:03:16.747281",
|
|
481
|
+
"status": "completed"
|
|
482
|
+
},
|
|
483
|
+
"tags": []
|
|
484
|
+
},
|
|
485
|
+
"source": [
|
|
486
|
+
"### Interpreting Effect Sizes (Cohen's d)\n",
|
|
487
|
+
"\n",
|
|
488
|
+
"| Effect Size | Interpretation | What It Means for Modeling |\n",
|
|
489
|
+
"|-------------|----------------|---------------------------|\n",
|
|
490
|
+
"| \\|d\\| ≥ 0.8 | Large | Strong discriminator - prioritize this feature |\n",
|
|
491
|
+
"| \\|d\\| = 0.5-0.8 | Medium | Useful predictor - include in model |\n",
|
|
492
|
+
"| \\|d\\| = 0.2-0.5 | Small | Weak but may help in combination with others |\n",
|
|
493
|
+
"| \\|d\\| < 0.2 | Negligible | Limited predictive value alone |\n",
|
|
494
|
+
"\n",
|
|
495
|
+
"**🎯 Actionable Insights:**\n",
|
|
496
|
+
"- **Features with large effects** are your best predictors - ensure they're included in your model\n",
|
|
497
|
+
"- **Direction matters**: \"Higher in retained\" means customers with high values tend to stay; use this for threshold-based business rules\n",
|
|
498
|
+
"- **Features with small/negligible effects** may still be useful in combination or as interaction terms\n",
|
|
499
|
+
"\n",
|
|
500
|
+
"**⚠️ Cautions:**\n",
|
|
501
|
+
"- Effect size assumes roughly normal distributions - check skewness in notebook 03\n",
|
|
502
|
+
"- Large effects could be due to confounding variables - validate with domain knowledge\n",
|
|
503
|
+
"- Correlation ≠ causation: high engagement may not *cause* retention\n",
|
|
504
|
+
"\n",
|
|
505
|
+
"### Box Plot Visualization\n",
|
|
506
|
+
"\n",
|
|
507
|
+
"**📈 How to Read the Box Plots Below:**\n",
|
|
508
|
+
"- **Well-separated boxes** (little/no overlap) → Feature clearly distinguishes retained vs churned\n",
|
|
509
|
+
"- **Different medians** (center lines at different heights) → Groups have different typical values\n",
|
|
510
|
+
"- **Many outliers in one group** → May indicate subpopulations worth investigating"
|
|
511
|
+
]
|
|
512
|
+
},
|
|
513
|
+
{
|
|
514
|
+
"cell_type": "code",
|
|
515
|
+
"execution_count": null,
|
|
516
|
+
"id": "bb7c3c73",
|
|
517
|
+
"metadata": {
|
|
518
|
+
"execution": {
|
|
519
|
+
"iopub.execute_input": "2026-02-02T13:03:16.763518Z",
|
|
520
|
+
"iopub.status.busy": "2026-02-02T13:03:16.763378Z",
|
|
521
|
+
"iopub.status.idle": "2026-02-02T13:03:16.813650Z",
|
|
522
|
+
"shell.execute_reply": "2026-02-02T13:03:16.812769Z"
|
|
523
|
+
},
|
|
524
|
+
"papermill": {
|
|
525
|
+
"duration": 0.056916,
|
|
526
|
+
"end_time": "2026-02-02T13:03:16.814355",
|
|
527
|
+
"exception": false,
|
|
528
|
+
"start_time": "2026-02-02T13:03:16.757439",
|
|
529
|
+
"status": "completed"
|
|
530
|
+
},
|
|
531
|
+
"tags": []
|
|
532
|
+
},
|
|
533
|
+
"outputs": [],
|
|
534
|
+
"source": [
|
|
535
|
+
"# Box Plots: Visual comparison of distributions\n",
|
|
536
|
+
"if findings.target_column and findings.target_column in df.columns:\n",
|
|
537
|
+
" target = findings.target_column\n",
|
|
538
|
+
" \n",
|
|
539
|
+
" feature_cols = [\n",
|
|
540
|
+
" name for name, col in findings.columns.items()\n",
|
|
541
|
+
" if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
|
|
542
|
+
" and name != target\n",
|
|
543
|
+
" and name not in TEMPORAL_METADATA_COLS\n",
|
|
544
|
+
" ]\n",
|
|
545
|
+
" \n",
|
|
546
|
+
" if feature_cols:\n",
|
|
547
|
+
" # Create box plots - one subplot per feature for better control\n",
|
|
548
|
+
" n_features = min(len(feature_cols), 6)\n",
|
|
549
|
+
" \n",
|
|
550
|
+
" fig = make_subplots(\n",
|
|
551
|
+
" rows=1, cols=n_features,\n",
|
|
552
|
+
" subplot_titles=feature_cols[:n_features],\n",
|
|
553
|
+
" horizontal_spacing=0.05\n",
|
|
554
|
+
" )\n",
|
|
555
|
+
" \n",
|
|
556
|
+
" for i, col in enumerate(feature_cols[:n_features]):\n",
|
|
557
|
+
" col_num = i + 1\n",
|
|
558
|
+
" \n",
|
|
559
|
+
" # Retained (1) - Green\n",
|
|
560
|
+
" retained_data = df[df[target] == 1][col].dropna()\n",
|
|
561
|
+
" fig.add_trace(\n",
|
|
562
|
+
" go.Box(\n",
|
|
563
|
+
" y=retained_data,\n",
|
|
564
|
+
" name='Retained',\n",
|
|
565
|
+
" fillcolor='rgba(46, 204, 113, 0.7)',\n",
|
|
566
|
+
" line=dict(color='#1e8449', width=2),\n",
|
|
567
|
+
" marker=dict(\n",
|
|
568
|
+
" color='rgba(46, 204, 113, 0.5)', # Light green outliers\n",
|
|
569
|
+
" size=5,\n",
|
|
570
|
+
" line=dict(color='#1e8449', width=1)\n",
|
|
571
|
+
" ),\n",
|
|
572
|
+
" boxpoints='outliers',\n",
|
|
573
|
+
" width=0.35,\n",
|
|
574
|
+
" showlegend=(i == 0),\n",
|
|
575
|
+
" legendgroup='retained',\n",
|
|
576
|
+
" offsetgroup='retained'\n",
|
|
577
|
+
" ),\n",
|
|
578
|
+
" row=1, col=col_num\n",
|
|
579
|
+
" )\n",
|
|
580
|
+
" \n",
|
|
581
|
+
" # Churned (0) - Red\n",
|
|
582
|
+
" churned_data = df[df[target] == 0][col].dropna()\n",
|
|
583
|
+
" fig.add_trace(\n",
|
|
584
|
+
" go.Box(\n",
|
|
585
|
+
" y=churned_data,\n",
|
|
586
|
+
" name='Churned',\n",
|
|
587
|
+
" fillcolor='rgba(231, 76, 60, 0.7)',\n",
|
|
588
|
+
" line=dict(color='#922b21', width=2),\n",
|
|
589
|
+
" marker=dict(\n",
|
|
590
|
+
" color='rgba(231, 76, 60, 0.5)', # Light red outliers\n",
|
|
591
|
+
" size=5,\n",
|
|
592
|
+
" line=dict(color='#922b21', width=1)\n",
|
|
593
|
+
" ),\n",
|
|
594
|
+
" boxpoints='outliers',\n",
|
|
595
|
+
" width=0.35,\n",
|
|
596
|
+
" showlegend=(i == 0),\n",
|
|
597
|
+
" legendgroup='churned',\n",
|
|
598
|
+
" offsetgroup='churned'\n",
|
|
599
|
+
" ),\n",
|
|
600
|
+
" row=1, col=col_num\n",
|
|
601
|
+
" )\n",
|
|
602
|
+
" \n",
|
|
603
|
+
" fig.update_layout(\n",
|
|
604
|
+
" height=450,\n",
|
|
605
|
+
" title_text=\"Feature Distributions: Retained (Green) vs Churned (Red)\",\n",
|
|
606
|
+
" template='plotly_white',\n",
|
|
607
|
+
" showlegend=True,\n",
|
|
608
|
+
" legend=dict(orientation=\"h\", yanchor=\"bottom\", y=1.05, xanchor=\"center\", x=0.5),\n",
|
|
609
|
+
" boxmode='group',\n",
|
|
610
|
+
" boxgap=0.3,\n",
|
|
611
|
+
" boxgroupgap=0.1\n",
|
|
612
|
+
" )\n",
|
|
613
|
+
" \n",
|
|
614
|
+
" # Center the boxes by removing x-axis tick labels (title is above each subplot)\n",
|
|
615
|
+
" fig.update_xaxes(showticklabels=False)\n",
|
|
616
|
+
" \n",
|
|
617
|
+
" display_figure(fig)\n",
|
|
618
|
+
" \n",
|
|
619
|
+
" # Print mean comparison\n",
|
|
620
|
+
" print(\"\\n📊 MEAN COMPARISON BY RETENTION STATUS:\")\n",
|
|
621
|
+
" print(\"-\" * 70)\n",
|
|
622
|
+
" for col in feature_cols[:n_features]:\n",
|
|
623
|
+
" retained_mean = df[df[target] == 1][col].mean()\n",
|
|
624
|
+
" churned_mean = df[df[target] == 0][col].mean()\n",
|
|
625
|
+
" diff_pct = ((retained_mean - churned_mean) / churned_mean * 100) if churned_mean != 0 else 0\n",
|
|
626
|
+
" print(f\" {col}:\")\n",
|
|
627
|
+
" print(f\" Retained: {retained_mean:.2f} | Churned: {churned_mean:.2f} | Diff: {diff_pct:+.1f}%\")"
|
|
628
|
+
]
|
|
629
|
+
},
|
|
630
|
+
{
|
|
631
|
+
"cell_type": "markdown",
|
|
632
|
+
"id": "b19f7661",
|
|
633
|
+
"metadata": {
|
|
634
|
+
"papermill": {
|
|
635
|
+
"duration": 0.007437,
|
|
636
|
+
"end_time": "2026-02-02T13:03:16.829786",
|
|
637
|
+
"exception": false,
|
|
638
|
+
"start_time": "2026-02-02T13:03:16.822349",
|
|
639
|
+
"status": "completed"
|
|
640
|
+
},
|
|
641
|
+
"tags": []
|
|
642
|
+
},
|
|
643
|
+
"source": [
|
|
644
|
+
"## 4.5 Feature-Target Correlations\n",
|
|
645
|
+
"\n",
|
|
646
|
+
"Features ranked by absolute correlation with the target variable.\n",
|
|
647
|
+
"\n",
|
|
648
|
+
"**📖 Interpretation:**\n",
|
|
649
|
+
"- **Positive correlation**: Higher values = more likely retained\n",
|
|
650
|
+
"- **Negative correlation**: Higher values = more likely churned\n",
|
|
651
|
+
"- **|r| > 0.3**: Moderately predictive\n",
|
|
652
|
+
"- **|r| > 0.5**: Strongly predictive"
|
|
653
|
+
]
|
|
654
|
+
},
|
|
655
|
+
{
|
|
656
|
+
"cell_type": "code",
|
|
657
|
+
"execution_count": null,
|
|
658
|
+
"id": "f718d3c3",
|
|
659
|
+
"metadata": {
|
|
660
|
+
"execution": {
|
|
661
|
+
"iopub.execute_input": "2026-02-02T13:03:16.845374Z",
|
|
662
|
+
"iopub.status.busy": "2026-02-02T13:03:16.845243Z",
|
|
663
|
+
"iopub.status.idle": "2026-02-02T13:03:16.868708Z",
|
|
664
|
+
"shell.execute_reply": "2026-02-02T13:03:16.868210Z"
|
|
665
|
+
},
|
|
666
|
+
"papermill": {
|
|
667
|
+
"duration": 0.032366,
|
|
668
|
+
"end_time": "2026-02-02T13:03:16.869546",
|
|
669
|
+
"exception": false,
|
|
670
|
+
"start_time": "2026-02-02T13:03:16.837180",
|
|
671
|
+
"status": "completed"
|
|
672
|
+
},
|
|
673
|
+
"tags": []
|
|
674
|
+
},
|
|
675
|
+
"outputs": [],
|
|
676
|
+
"source": [
|
|
677
|
+
"if findings.target_column and findings.target_column in df.columns:\n",
|
|
678
|
+
" target = findings.target_column\n",
|
|
679
|
+
" feature_cols = [\n",
|
|
680
|
+
" name for name, col in findings.columns.items()\n",
|
|
681
|
+
" if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
|
|
682
|
+
" and name != target\n",
|
|
683
|
+
" and name not in TEMPORAL_METADATA_COLS\n",
|
|
684
|
+
" ]\n",
|
|
685
|
+
" \n",
|
|
686
|
+
" if feature_cols:\n",
|
|
687
|
+
" correlations = []\n",
|
|
688
|
+
" for col in feature_cols:\n",
|
|
689
|
+
" corr = df[[col, target]].corr().iloc[0, 1]\n",
|
|
690
|
+
" correlations.append({\"Feature\": col, \"Correlation\": corr})\n",
|
|
691
|
+
" \n",
|
|
692
|
+
" corr_df = pd.DataFrame(correlations).sort_values(\"Correlation\", key=abs, ascending=False)\n",
|
|
693
|
+
" \n",
|
|
694
|
+
" fig = charts.bar_chart(\n",
|
|
695
|
+
" corr_df[\"Feature\"].tolist(),\n",
|
|
696
|
+
" corr_df[\"Correlation\"].tolist(),\n",
|
|
697
|
+
" title=f\"Feature Correlations with {target}\"\n",
|
|
698
|
+
" )\n",
|
|
699
|
+
" display_figure(fig)\n",
|
|
700
|
+
"else:\n",
|
|
701
|
+
" print(\"Target column not available for correlation analysis.\")"
|
|
702
|
+
]
|
|
703
|
+
},
|
|
704
|
+
{
|
|
705
|
+
"cell_type": "markdown",
|
|
706
|
+
"id": "d41da224",
|
|
707
|
+
"metadata": {
|
|
708
|
+
"papermill": {
|
|
709
|
+
"duration": 0.009731,
|
|
710
|
+
"end_time": "2026-02-02T13:03:16.888030",
|
|
711
|
+
"exception": false,
|
|
712
|
+
"start_time": "2026-02-02T13:03:16.878299",
|
|
713
|
+
"status": "completed"
|
|
714
|
+
},
|
|
715
|
+
"tags": []
|
|
716
|
+
},
|
|
717
|
+
"source": [
|
|
718
|
+
"## 4.6 Categorical Feature Analysis\n",
|
|
719
|
+
"\n",
|
|
720
|
+
"Retention rates by category help identify which segments are at higher risk.\n",
|
|
721
|
+
"\n",
|
|
722
|
+
"**📖 What to Look For:**\n",
|
|
723
|
+
"- Categories with **low retention rates** = high-risk segments for intervention\n",
|
|
724
|
+
"- **Large variation** across categories = strong predictive feature\n",
|
|
725
|
+
"- **Small categories** with extreme rates may be unreliable (small sample size)\n",
|
|
726
|
+
"\n",
|
|
727
|
+
"**📊 Metrics Explained:**\n",
|
|
728
|
+
"- **Retention Rate**: % of customers in category who were retained\n",
|
|
729
|
+
"- **Lift**: How much better/worse than overall retention rate (>1 = better, <1 = worse)\n",
|
|
730
|
+
"- **Cramér's V**: Strength of association (0-1 scale, like correlation for categorical)"
|
|
731
|
+
]
|
|
732
|
+
},
|
|
733
|
+
{
|
|
734
|
+
"cell_type": "code",
|
|
735
|
+
"execution_count": null,
|
|
736
|
+
"id": "e1ccb220",
|
|
737
|
+
"metadata": {
|
|
738
|
+
"execution": {
|
|
739
|
+
"iopub.execute_input": "2026-02-02T13:03:16.906720Z",
|
|
740
|
+
"iopub.status.busy": "2026-02-02T13:03:16.906590Z",
|
|
741
|
+
"iopub.status.idle": "2026-02-02T13:03:16.957148Z",
|
|
742
|
+
"shell.execute_reply": "2026-02-02T13:03:16.956750Z"
|
|
743
|
+
},
|
|
744
|
+
"papermill": {
|
|
745
|
+
"duration": 0.061002,
|
|
746
|
+
"end_time": "2026-02-02T13:03:16.957922",
|
|
747
|
+
"exception": false,
|
|
748
|
+
"start_time": "2026-02-02T13:03:16.896920",
|
|
749
|
+
"status": "completed"
|
|
750
|
+
},
|
|
751
|
+
"tags": []
|
|
752
|
+
},
|
|
753
|
+
"outputs": [],
|
|
754
|
+
"source": [
|
|
755
|
+
"from customer_retention.stages.profiling import CategoricalTargetAnalyzer\n",
|
|
756
|
+
"\n",
|
|
757
|
+
"if findings.target_column:\n",
|
|
758
|
+
" target = findings.target_column\n",
|
|
759
|
+
" overall_retention = df[target].mean()\n",
|
|
760
|
+
" \n",
|
|
761
|
+
" categorical_cols = [\n",
|
|
762
|
+
" name for name, col in findings.columns.items()\n",
|
|
763
|
+
" if col.inferred_type in [ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL]\n",
|
|
764
|
+
" and name not in TEMPORAL_METADATA_COLS\n",
|
|
765
|
+
" ]\n",
|
|
766
|
+
" \n",
|
|
767
|
+
" print(\"=\" * 80)\n",
|
|
768
|
+
" print(\"CATEGORICAL FEATURE ANALYSIS\")\n",
|
|
769
|
+
" print(\"=\" * 80)\n",
|
|
770
|
+
" print(f\"Overall retention rate: {overall_retention:.1%}\")\n",
|
|
771
|
+
" \n",
|
|
772
|
+
" if categorical_cols:\n",
|
|
773
|
+
" # Use framework analyzer for summary\n",
|
|
774
|
+
" cat_analyzer = CategoricalTargetAnalyzer(min_samples_per_category=10)\n",
|
|
775
|
+
" summary_df = cat_analyzer.analyze_multiple(df, categorical_cols, target)\n",
|
|
776
|
+
" \n",
|
|
777
|
+
" print(\"\\n📈 Categorical Feature Strength (Cramér's V):\")\n",
|
|
778
|
+
" print(\"-\" * 60)\n",
|
|
779
|
+
" for _, row in summary_df.iterrows():\n",
|
|
780
|
+
" if row[\"cramers_v\"] >= 0.3:\n",
|
|
781
|
+
" strength = \"Strong\"\n",
|
|
782
|
+
" emoji = \"🔴\"\n",
|
|
783
|
+
" elif row[\"cramers_v\"] >= 0.1:\n",
|
|
784
|
+
" strength = \"Moderate\"\n",
|
|
785
|
+
" emoji = \"🟡\"\n",
|
|
786
|
+
" else:\n",
|
|
787
|
+
" strength = \"Weak\"\n",
|
|
788
|
+
" emoji = \"🟢\"\n",
|
|
789
|
+
" sig = \"***\" if row[\"p_value\"] < 0.001 else \"**\" if row[\"p_value\"] < 0.01 else \"*\" if row[\"p_value\"] < 0.05 else \"\"\n",
|
|
790
|
+
" print(f\" {emoji} {row['feature']}: V={row['cramers_v']:.3f} ({strength}) {sig}\")\n",
+ " \n",
+ " # Detailed analysis for each categorical feature\n",
+ " for col_name in categorical_cols[:5]:\n",
+ " result = cat_analyzer.analyze(df, col_name, target)\n",
+ " \n",
+ " print(f\"\\n{'='*60}\")\n",
+ " print(f\"📊 {col_name.upper()}\")\n",
+ " print(\"=\"*60)\n",
+ " \n",
+ " # Display stats table\n",
+ " if len(result.category_stats) > 0:\n",
+ " display_stats = result.category_stats[['category', 'total_count', 'retention_rate', 'lift', 'pct_of_total']].copy()\n",
+ " display_stats['retention_rate'] = display_stats['retention_rate'].apply(lambda x: f\"{x:.1%}\")\n",
+ " display_stats['lift'] = display_stats['lift'].apply(lambda x: f\"{x:.2f}x\")\n",
+ " display_stats['pct_of_total'] = display_stats['pct_of_total'].apply(lambda x: f\"{x:.1%}\")\n",
+ " display_stats.columns = [col_name, 'Count', 'Retention Rate', 'Lift', '% of Data']\n",
+ " display(display_stats)\n",
+ " \n",
+ " # Stacked bar chart\n",
+ " cat_stats = result.category_stats\n",
+ " categories = cat_stats['category'].tolist()\n",
+ " retained_counts = cat_stats['retained_count'].tolist()\n",
+ " churned_counts = cat_stats['churned_count'].tolist()\n",
+ " \n",
+ " fig = go.Figure()\n",
+ " \n",
+ " fig.add_trace(go.Bar(\n",
+ " name='Retained',\n",
+ " x=categories,\n",
+ " y=retained_counts,\n",
+ " marker_color='rgba(46, 204, 113, 0.8)',\n",
+ " text=[f\"{r/(r+c)*100:.0f}%\" for r, c in zip(retained_counts, churned_counts)],\n",
+ " textposition='inside',\n",
+ " textfont=dict(color='white', size=12)\n",
+ " ))\n",
+ " \n",
+ " fig.add_trace(go.Bar(\n",
+ " name='Churned',\n",
+ " x=categories,\n",
+ " y=churned_counts,\n",
+ " marker_color='rgba(231, 76, 60, 0.8)',\n",
+ " text=[f\"{c/(r+c)*100:.0f}%\" for r, c in zip(retained_counts, churned_counts)],\n",
+ " textposition='inside',\n",
+ " textfont=dict(color='white', size=12)\n",
+ " ))\n",
+ " \n",
+ " fig.update_layout(\n",
+ " barmode='stack',\n",
+ " title=f\"Retention by {col_name}\",\n",
+ " xaxis_title=col_name,\n",
+ " yaxis_title=\"Count\",\n",
+ " template='plotly_white',\n",
+ " height=350,\n",
+ " legend=dict(orientation=\"h\", yanchor=\"bottom\", y=1.02, xanchor=\"center\", x=0.5)\n",
+ " )\n",
+ " display_figure(fig)\n",
+ " \n",
+ " # Flag high-risk categories from framework result\n",
+ " if result.high_risk_categories:\n",
+ " print(f\"\\n ⚠️ High-risk categories (lift < 0.9x):\")\n",
+ " for cat in result.high_risk_categories:\n",
+ " cat_row = cat_stats[cat_stats['category'] == cat].iloc[0]\n",
+ " print(f\" • {cat}: {cat_row['retention_rate']:.1%} retention ({cat_row['lift']:.2f}x lift)\")\n",
+ " else:\n",
+ " print(\"\\n ℹ️ No categorical columns detected.\")\n",
+ "else:\n",
+ " print(\"No target column available for categorical analysis.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d65f086f",
+ "metadata": {
+ "papermill": {
+ "duration": 0.012264,
+ "end_time": "2026-02-02T13:03:16.982488",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:16.970224",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "## 4.7 Scatter Plot Matrix (Sample)\n",
+ "\n",
+ "Visual exploration of pairwise relationships between numeric features.\n",
+ "\n",
+ "**📖 How to Read the Scatter Matrix:**\n",
+ "- **Diagonal**: Distribution of each feature (histogram or density)\n",
+ "- **Off-diagonal**: Scatter plot showing relationship between two features\n",
+ "- Each row/column represents one feature\n",
+ "\n",
+ "**🔍 What to Look For:**\n",
+ "\n",
+ "| Pattern | What It Means | Action |\n",
+ "|---------|--------------|--------|\n",
+ "| **Linear trend** (diagonal line of points) | Strong correlation | Check if redundant; may cause multicollinearity |\n",
+ "| **Curved pattern** | Non-linear relationship | Consider polynomial features or transformations |\n",
+ "| **Clusters/groups** | Natural segments in data | May benefit from segment-aware modeling |\n",
+ "| **Fan shape** (spreading out) | Heteroscedasticity | May need log transform or robust methods |\n",
+ "| **Random scatter** | No relationship | Features are independent |\n",
+ "\n",
+ "**⚠️ Cautions:**\n",
+ "- Sample shown (max 1000 points) for performance - patterns may differ in full data\n",
+ "- Look for the same patterns in correlation matrix (section 4.2) to confirm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dd026d92",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-02-02T13:03:17.008415Z",
+ "iopub.status.busy": "2026-02-02T13:03:17.008306Z",
+ "iopub.status.idle": "2026-02-02T13:03:17.032972Z",
+ "shell.execute_reply": "2026-02-02T13:03:17.032460Z"
+ },
+ "papermill": {
+ "duration": 0.038406,
+ "end_time": "2026-02-02T13:03:17.033612",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:16.995206",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "top_numeric = numeric_cols[:4] if len(numeric_cols) > 4 else numeric_cols\n",
+ "\n",
+ "if len(top_numeric) >= 2:\n",
+ " fig = charts.scatter_matrix(\n",
+ " df[top_numeric].sample(min(1000, len(df))),\n",
+ " title=\"Scatter Plot Matrix (Sample)\"\n",
+ " )\n",
+ " display_figure(fig)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dc50713b",
+ "metadata": {
+ "papermill": {
+ "duration": 0.013511,
+ "end_time": "2026-02-02T13:03:17.060685",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.047174",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "### Interpreting the Scatter Matrix Above\n",
+ "\n",
+ "**🎯 Key Questions to Answer:**\n",
+ "\n",
+ "1. **Are any features redundant?**\n",
+ " - Look for tight linear patterns → high correlation → consider dropping one\n",
+ " - Cross-reference with high correlation pairs in section 4.3\n",
+ "\n",
+ "2. **Are there natural customer segments?**\n",
+ " - Distinct clusters suggest different customer types\n",
+ " - Links to segment-aware outlier analysis in notebook 03\n",
+ "\n",
+ "3. **Do relationships suggest feature engineering?**\n",
+ " - Curved patterns → polynomial or interaction terms may help\n",
+ " - Ratios between correlated features may be more predictive\n",
+ "\n",
+ "4. **Are distributions suitable for linear models?**\n",
+ " - Fan shapes or heavy skew → consider transformations\n",
+ " - Outlier clusters → verify with segment analysis\n",
+ "\n",
+ "**💡 Pro Tip:** Hover over points in the interactive plot to see exact values. Look for outliers that appear across multiple scatter plots - these may be influential observations worth investigating."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2ef5f64a",
+ "metadata": {
+ "papermill": {
+ "duration": 0.013946,
+ "end_time": "2026-02-02T13:03:17.087881",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.073935",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "## 4.8 Datetime Feature Analysis\n",
+ "\n",
+ "Temporal patterns can reveal important retention signals - when customers joined, their last activity, and seasonal patterns.\n",
+ "\n",
+ "**📖 What to Look For:**\n",
+ "- **Cohort effects**: Do customers who joined in certain periods have different retention?\n",
+ "- **Recency patterns**: How does time since last activity relate to retention?\n",
+ "- **Seasonal trends**: Are there monthly or quarterly patterns?\n",
+ "\n",
+ "**📊 Common Temporal Features:**\n",
+ "| Feature Type | Example | Typical Insight |\n",
+ "|-------------|---------|-----------------|\n",
+ "| **Tenure** | Days since signup | Longer tenure often = higher retention |\n",
+ "| **Recency** | Days since last order | Recent activity = engaged customer |\n",
+ "| **Cohort** | Signup month/year | Economic conditions affect cohorts |\n",
+ "| **Day of Week** | Signup day | Weekend vs weekday patterns |"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "71e2d38c",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-02-02T13:03:17.116500Z",
+ "iopub.status.busy": "2026-02-02T13:03:17.116364Z",
+ "iopub.status.idle": "2026-02-02T13:03:17.123894Z",
+ "shell.execute_reply": "2026-02-02T13:03:17.123169Z"
+ },
+ "papermill": {
+ "duration": 0.023284,
+ "end_time": "2026-02-02T13:03:17.124497",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.101213",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from customer_retention.stages.profiling import TemporalTargetAnalyzer\n",
+ "\n",
+ "datetime_cols = [\n",
+ " name for name, col in findings.columns.items()\n",
+ " if col.inferred_type == ColumnType.DATETIME\n",
+ "]\n",
+ "\n",
+ "print(\"=\" * 80)\n",
+ "print(\"DATETIME FEATURE ANALYSIS\")\n",
+ "print(\"=\" * 80)\n",
+ "print(f\"Detected datetime columns: {datetime_cols}\")\n",
+ "\n",
+ "if datetime_cols and findings.target_column:\n",
+ " target = findings.target_column\n",
+ " overall_retention = df[target].mean()\n",
+ " \n",
+ " # Use framework analyzer\n",
+ " temporal_analyzer = TemporalTargetAnalyzer(min_samples_per_period=10)\n",
+ " \n",
+ " for col_name in datetime_cols[:3]:\n",
+ " result = temporal_analyzer.analyze(df, col_name, target)\n",
+ " \n",
+ " print(f\"\\n{'='*60}\")\n",
+ " print(f\"📅 {col_name.upper()}\")\n",
+ " print(\"=\"*60)\n",
+ " \n",
+ " if result.n_valid_dates == 0:\n",
+ " print(\" No valid dates found\")\n",
+ " continue\n",
+ " \n",
+ " print(f\" Date range: {result.min_date} to {result.max_date}\")\n",
+ " print(f\" Valid dates: {result.n_valid_dates:,}\")\n",
+ " \n",
+ " # 1. Retention by Year (from framework result)\n",
+ " if len(result.yearly_stats) > 1:\n",
+ " print(f\"\\n 📊 Retention by Year: Trend is {result.yearly_trend}\")\n",
+ " \n",
+ " year_stats = result.yearly_stats\n",
+ " \n",
+ " fig = make_subplots(rows=1, cols=2, subplot_titles=[\"Retention Rate by Year\", \"Customer Count by Year\"],\n",
+ " column_widths=[0.6, 0.4])\n",
+ " \n",
+ " fig.add_trace(\n",
+ " go.Scatter(\n",
+ " x=year_stats['period'].astype(str),\n",
+ " y=year_stats['retention_rate'],\n",
+ " mode='lines+markers',\n",
+ " name='Retention Rate',\n",
+ " line=dict(color='#3498db', width=3),\n",
+ " marker=dict(size=10)\n",
+ " ),\n",
+ " row=1, col=1\n",
+ " )\n",
+ " fig.add_hline(y=overall_retention, line_dash=\"dash\", line_color=\"gray\",\n",
+ " annotation_text=f\"Overall: {overall_retention:.1%}\", row=1, col=1)\n",
+ " \n",
+ " fig.add_trace(\n",
+ " go.Bar(\n",
+ " x=year_stats['period'].astype(str),\n",
+ " y=year_stats['count'],\n",
+ " name='Count',\n",
+ " marker_color='rgba(52, 152, 219, 0.6)'\n",
+ " ),\n",
+ " row=1, col=2\n",
+ " )\n",
+ " \n",
+ " fig.update_layout(height=350, template='plotly_white', showlegend=False)\n",
+ " fig.update_yaxes(tickformat='.0%', row=1, col=1)\n",
+ " display_figure(fig)\n",
+ " \n",
+ " # 2. Retention by Month (from framework result)\n",
+ " if len(result.monthly_stats) > 1:\n",
+ " print(f\"\\n 📊 Retention by Month (Seasonality):\")\n",
+ " \n",
+ " month_stats = result.monthly_stats\n",
+ " colors = ['rgba(46, 204, 113, 0.7)' if r >= overall_retention else 'rgba(231, 76, 60, 0.7)' \n",
+ " for r in month_stats['retention_rate']]\n",
+ " \n",
+ " fig = go.Figure()\n",
+ " fig.add_trace(go.Bar(\n",
+ " x=month_stats['month_name'],\n",
+ " y=month_stats['retention_rate'],\n",
+ " marker_color=colors,\n",
+ " text=[f\"{r:.0%}\" for r in month_stats['retention_rate']],\n",
+ " textposition='outside'\n",
+ " ))\n",
+ " fig.add_hline(y=overall_retention, line_dash=\"dash\", line_color=\"gray\",\n",
+ " annotation_text=f\"Overall: {overall_retention:.1%}\")\n",
+ " \n",
+ " fig.update_layout(\n",
+ " title=f\"Monthly Retention Pattern ({col_name})\",\n",
+ " xaxis_title=\"Month\",\n",
+ " yaxis_title=\"Retention Rate\",\n",
+ " template='plotly_white',\n",
+ " height=350,\n",
+ " yaxis_tickformat='.0%'\n",
+ " )\n",
+ " display_figure(fig)\n",
+ " \n",
+ " # Seasonal insights from framework\n",
+ " if result.seasonal_spread > 0.05:\n",
+ " print(f\" 📈 Seasonal spread: {result.seasonal_spread:.1%}\")\n",
+ " print(f\" Best month: {result.best_month}\")\n",
+ " print(f\" Worst month: {result.worst_month}\")\n",
+ " \n",
+ " # 3. Retention by Day of Week (from framework result)\n",
+ " if len(result.dow_stats) > 1:\n",
+ " print(f\"\\n 📊 Retention by Day of Week:\")\n",
+ " \n",
+ " dow_stats = result.dow_stats\n",
+ " colors = ['rgba(46, 204, 113, 0.7)' if r >= overall_retention else 'rgba(231, 76, 60, 0.7)' \n",
+ " for r in dow_stats['retention_rate']]\n",
+ " \n",
+ " fig = go.Figure()\n",
+ " fig.add_trace(go.Bar(\n",
+ " x=dow_stats['day_name'],\n",
+ " y=dow_stats['retention_rate'],\n",
+ " marker_color=colors,\n",
+ " text=[f\"{r:.0%}\" for r in dow_stats['retention_rate']],\n",
+ " textposition='outside'\n",
+ " ))\n",
+ " fig.add_hline(y=overall_retention, line_dash=\"dash\", line_color=\"gray\")\n",
+ " \n",
+ " fig.update_layout(\n",
+ " title=f\"Day of Week Pattern ({col_name})\",\n",
+ " xaxis_title=\"Day of Week\",\n",
+ " yaxis_title=\"Retention Rate\",\n",
+ " template='plotly_white',\n",
+ " height=300,\n",
+ " yaxis_tickformat='.0%'\n",
+ " )\n",
+ " display_figure(fig)\n",
+ "else:\n",
+ " if not datetime_cols:\n",
+ " print(\"\\n ℹ️ No datetime columns detected in this dataset.\")\n",
+ " print(\" Consider adding date parsing in notebook 01 if dates exist as strings.\")\n",
+ " else:\n",
+ " print(\"\\n ℹ️ No target column available for retention analysis.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e119d650",
+ "metadata": {
+ "papermill": {
+ "duration": 0.013421,
+ "end_time": "2026-02-02T13:03:17.151526",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.138105",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "## 4.9 Actionable Recommendations Summary\n",
+ "\n",
+ "This section consolidates all relationship analysis findings into **actionable recommendations** organized by their impact on the modeling pipeline.\n",
+ "\n",
+ "**📋 Recommendation Categories:**\n",
+ "\n",
+ "| Category | Purpose | Impact |\n",
+ "|----------|---------|--------|\n",
+ "| **Feature Selection** | Which features to keep/drop | Reduces noise, improves interpretability |\n",
+ "| **Feature Engineering** | New features to create | Captures interactions, improves accuracy |\n",
+ "| **Stratification** | Train/test split strategy | Ensures fair evaluation, prevents leakage |\n",
+ "| **Model Selection** | Which algorithms to try | Matches model to data characteristics |"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "53cda4e0",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-02-02T13:03:17.179882Z",
+ "iopub.status.busy": "2026-02-02T13:03:17.179754Z",
+ "iopub.status.idle": "2026-02-02T13:03:17.305613Z",
+ "shell.execute_reply": "2026-02-02T13:03:17.305019Z"
+ },
+ "papermill": {
+ "duration": 0.141468,
+ "end_time": "2026-02-02T13:03:17.306177",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.164709",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Generate comprehensive actionable recommendations\n",
+ "recommender = RelationshipRecommender()\n",
+ "\n",
+ "# Gather columns by type\n",
+ "numeric_features = [\n",
+ " name for name, col in findings.columns.items()\n",
+ " if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
+ " and name != findings.target_column\n",
+ " and name not in TEMPORAL_METADATA_COLS\n",
+ "]\n",
+ "categorical_features = [\n",
+ " name for name, col in findings.columns.items()\n",
+ " if col.inferred_type in [ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL]\n",
+ " and name not in TEMPORAL_METADATA_COLS\n",
+ "]\n",
+ "\n",
+ "# Run comprehensive analysis\n",
+ "analysis_summary = recommender.analyze(\n",
+ " df,\n",
+ " numeric_cols=numeric_features,\n",
+ " categorical_cols=categorical_features,\n",
+ " target_col=findings.target_column,\n",
+ ")\n",
+ "\n",
+ "print(\"=\" * 80)\n",
+ "print(\"ACTIONABLE RECOMMENDATIONS FROM RELATIONSHIP ANALYSIS\")\n",
+ "print(\"=\" * 80)\n",
+ "\n",
+ "# Group recommendations by category\n",
+ "grouped_recs = analysis_summary.recommendations_by_category\n",
+ "high_priority = analysis_summary.high_priority_actions\n",
+ "\n",
+ "if high_priority:\n",
+ " print(f\"\\n🔴 HIGH PRIORITY ACTIONS ({len(high_priority)}):\")\n",
+ " print(\"-\" * 60)\n",
+ " for rec in high_priority:\n",
+ " print(f\"\\n 📌 {rec.title}\")\n",
+ " print(f\" {rec.description}\")\n",
+ " print(f\" → Action: {rec.action}\")\n",
+ " if rec.affected_features:\n",
+ " print(f\" → Features: {', '.join(rec.affected_features[:5])}\")\n",
+ "\n",
+ "# Persist recommendations to registry\n",
+ "for pair in analysis_summary.multicollinear_pairs:\n",
+ " registry.add_gold_drop_multicollinear(\n",
+ " column=pair[\"feature1\"], correlated_with=pair[\"feature2\"],\n",
+ " correlation=pair[\"correlation\"],\n",
+ " rationale=f\"High correlation ({pair['correlation']:.2f}) - consider dropping one\",\n",
+ " source_notebook=\"04_relationship_analysis\"\n",
+ " )\n",
+ "\n",
+ "for predictor in analysis_summary.strong_predictors:\n",
+ " registry.add_gold_prioritize_feature(\n",
+ " column=predictor[\"feature\"], effect_size=predictor[\"effect_size\"],\n",
+ " correlation=predictor[\"correlation\"],\n",
+ " rationale=f\"Strong predictor with effect size {predictor['effect_size']:.2f}\",\n",
+ " source_notebook=\"04_relationship_analysis\"\n",
+ " )\n",
+ "\n",
+ "for weak_col in analysis_summary.weak_predictors[:10]:\n",
+ " registry.add_gold_drop_weak(\n",
+ " column=weak_col, effect_size=0.0, correlation=0.0,\n",
+ " rationale=\"Negligible predictive power\",\n",
+ " source_notebook=\"04_relationship_analysis\"\n",
+ " )\n",
+ "\n",
+ "# Persist ratio feature recommendations\n",
+ "for rec in grouped_recs.get(RecommendationCategory.FEATURE_ENGINEERING, []):\n",
+ " if \"ratio\" in rec.title.lower() and len(rec.affected_features) >= 2:\n",
+ " registry.add_silver_ratio(\n",
+ " column=f\"{rec.affected_features[0]}_to_{rec.affected_features[1]}_ratio\",\n",
+ " numerator=rec.affected_features[0], denominator=rec.affected_features[1],\n",
+ " rationale=rec.description, source_notebook=\"04_relationship_analysis\"\n",
+ " )\n",
+ " elif \"interaction\" in rec.title.lower() and len(rec.affected_features) >= 2:\n",
+ " for i, f1 in enumerate(rec.affected_features[:3]):\n",
+ " for f2 in rec.affected_features[i+1:4]:\n",
+ " registry.add_silver_interaction(\n",
+ " column=f\"{f1}_x_{f2}\", features=[f1, f2],\n",
+ " rationale=rec.description, source_notebook=\"04_relationship_analysis\"\n",
+ " )\n",
+ "\n",
+ "# Store for findings metadata\n",
+ "findings.metadata[\"relationship_analysis\"] = {\n",
+ " \"n_recommendations\": len(analysis_summary.recommendations),\n",
+ " \"n_high_priority\": len(high_priority),\n",
+ " \"strong_predictors\": [p[\"feature\"] for p in analysis_summary.strong_predictors],\n",
+ " \"weak_predictors\": analysis_summary.weak_predictors[:5],\n",
+ " \"multicollinear_pairs\": [(p[\"feature1\"], p[\"feature2\"]) for p in analysis_summary.multicollinear_pairs],\n",
+ "}\n",
+ "\n",
+ "print(f\"\\n✅ Persisted {len(analysis_summary.multicollinear_pairs)} multicollinearity recommendations\")\n",
+ "print(f\"✅ Persisted {len(analysis_summary.strong_predictors)} strong predictor recommendations\")\n",
+ "print(f\"✅ Persisted {min(len(analysis_summary.weak_predictors), 10)} weak predictor recommendations\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9378cb05",
+ "metadata": {
+ "papermill": {
+ "duration": 0.013961,
+ "end_time": "2026-02-02T13:03:17.338912",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.324951",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "### 4.9.1 Feature Selection Recommendations\n",
+ "\n",
+ "**What these recommendations tell you:**\n",
+ "- Which features to **prioritize** (strong predictors)\n",
+ "- Which features to **consider dropping** (weak predictors, redundant features)\n",
+ "- Which feature pairs cause **multicollinearity** issues\n",
+ "\n",
+ "**📊 Decision Guide:**\n",
+ "\n",
+ "| Finding | Linear Models | Tree-Based Models |\n",
+ "|---------|--------------|-------------------|\n",
+ "| Strong predictors | Include - will have high coefficients | Include - will appear early in splits |\n",
+ "| Weak predictors | Consider dropping | May help in interactions |\n",
+ "| Multicollinear pairs | Drop one feature | Can keep both (trees handle it) |"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "035df8a7",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-02-02T13:03:17.367314Z",
+ "iopub.status.busy": "2026-02-02T13:03:17.367183Z",
+ "iopub.status.idle": "2026-02-02T13:03:17.376296Z",
+ "shell.execute_reply": "2026-02-02T13:03:17.375909Z"
+ },
+ "papermill": {
+ "duration": 0.024095,
+ "end_time": "2026-02-02T13:03:17.376933",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.352838",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Feature Selection Recommendations\n",
+ "selection_recs = grouped_recs.get(RecommendationCategory.FEATURE_SELECTION, [])\n",
+ "\n",
+ "print(\"=\" * 70)\n",
+ "print(\"FEATURE SELECTION\")\n",
+ "print(\"=\" * 70)\n",
+ "\n",
+ "# Strong predictors summary\n",
+ "if analysis_summary.strong_predictors:\n",
+ " print(\"\\n✅ STRONG PREDICTORS (prioritize these):\")\n",
+ " strong_df = pd.DataFrame(analysis_summary.strong_predictors)\n",
+ " strong_df[\"effect_size\"] = strong_df[\"effect_size\"].apply(lambda x: f\"{x:+.3f}\")\n",
+ " strong_df[\"correlation\"] = strong_df[\"correlation\"].apply(lambda x: f\"{x:+.3f}\")\n",
+ " strong_df = strong_df.sort_values(\"effect_size\", key=lambda x: x.str.replace(\"+\", \"\").astype(float).abs(), ascending=False)\n",
+ " display(strong_df)\n",
+ " \n",
+ " print(\"\\n 💡 These features show strong discrimination between retained/churned customers.\")\n",
+ " print(\" → Ensure they're included in your model\")\n",
+ " print(\" → Check for data quality issues that could inflate their importance\")\n",
+ "\n",
+ "# Weak predictors summary\n",
+ "if analysis_summary.weak_predictors:\n",
+ " print(f\"\\n⚪ WEAK PREDICTORS (consider dropping): {', '.join(analysis_summary.weak_predictors[:5])}\")\n",
+ " print(\" → Low individual predictive power, but may help in combination\")\n",
+ "\n",
+ "# Multicollinearity summary\n",
+ "if analysis_summary.multicollinear_pairs:\n",
+ " print(\"\\n⚠️ MULTICOLLINEAR PAIRS (drop one from each pair for linear models):\")\n",
+ " for pair in analysis_summary.multicollinear_pairs:\n",
+ " print(f\" • {pair['feature1']} ↔ {pair['feature2']}: r = {pair['correlation']:.2f}\")\n",
+ " print(\"\\n 💡 For each pair, keep the feature with:\")\n",
+ " print(\" - Stronger business meaning\")\n",
+ " print(\" - Higher target correlation\")\n",
+ " print(\" - Fewer missing values\")\n",
+ "\n",
+ "# Display all feature selection recommendations\n",
+ "if selection_recs:\n",
+ " print(\"\\n\" + \"-\" * 70)\n",
+ " print(\"DETAILED RECOMMENDATIONS:\")\n",
+ " for rec in selection_recs:\n",
+ " priority_icon = \"🔴\" if rec.priority == \"high\" else \"🟡\" if rec.priority == \"medium\" else \"🟢\"\n",
+ " print(f\"\\n{priority_icon} {rec.title}\")\n",
+ " print(f\" {rec.description}\")\n",
+ " print(f\" → {rec.action}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "79734dcf",
+ "metadata": {
+ "papermill": {
+ "duration": 0.014185,
+ "end_time": "2026-02-02T13:03:17.406448",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.392263",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "### 4.9.2 Stratification Recommendations\n",
+ "\n",
+ "**What these recommendations tell you:**\n",
+ "- How to **split your data** for training and testing\n",
+ "- Which **segments require special attention** in sampling\n",
+ "- **High-risk segments** that need adequate representation\n",
+ "\n",
+ "**⚠️ Why This Matters:**\n",
+ "- Random splits can under-represent rare segments\n",
+ "- High-risk segments may be systematically excluded\n",
+ "- Model evaluation will be biased without proper stratification\n",
+ "\n",
+ "**📊 Implementation:**\n",
+ "```python\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "# Stratified split by target\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ " X, y, test_size=0.2, stratify=y, random_state=42\n",
+ ")\n",
+ "\n",
+ "# Multi-column stratification (for categorical segments)\n",
+ "df['stratify_col'] = df['target'].astype(str) + '_' + df['segment']\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6de906bd",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-02-02T13:03:17.437828Z",
+ "iopub.status.busy": "2026-02-02T13:03:17.437580Z",
+ "iopub.status.idle": "2026-02-02T13:03:17.445400Z",
+ "shell.execute_reply": "2026-02-02T13:03:17.444741Z"
+ },
+ "papermill": {
+ "duration": 0.024109,
+ "end_time": "2026-02-02T13:03:17.446105",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.421996",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Stratification Recommendations\n",
+ "strat_recs = grouped_recs.get(RecommendationCategory.STRATIFICATION, [])\n",
+ "\n",
+ "print(\"=\" * 70)\n",
+ "print(\"STRATIFICATION (Train/Test Split Strategy)\")\n",
+ "print(\"=\" * 70)\n",
+ "\n",
+ "# High-risk segments\n",
+ "if analysis_summary.high_risk_segments:\n",
+ " print(\"\\n🎯 HIGH-RISK SEGMENTS (ensure representation in training data):\")\n",
+ " risk_df = pd.DataFrame(analysis_summary.high_risk_segments)\n",
+ " risk_df[\"retention_rate\"] = risk_df[\"retention_rate\"].apply(lambda x: f\"{x:.1%}\")\n",
+ " risk_df[\"lift\"] = risk_df[\"lift\"].apply(lambda x: f\"{x:.2f}x\")\n",
+ " display(risk_df[[\"feature\", \"segment\", \"count\", \"retention_rate\", \"lift\"]])\n",
+ " \n",
+ " print(\"\\n 💡 These segments have below-average retention.\")\n",
+ " print(\" → Ensure they're adequately represented in both train and test sets\")\n",
+ " print(\" → Consider oversampling or class weights in modeling\")\n",
+ "\n",
+ "# Display all stratification recommendations\n",
+ "if strat_recs:\n",
+ " print(\"\\n\" + \"-\" * 70)\n",
+ " print(\"STRATIFICATION RECOMMENDATIONS:\")\n",
+ " for rec in strat_recs:\n",
+ " priority_icon = \"🔴\" if rec.priority == \"high\" else \"🟡\" if rec.priority == \"medium\" else \"🟢\"\n",
+ " print(f\"\\n{priority_icon} {rec.title}\")\n",
+ " print(f\" {rec.description}\")\n",
+ " print(f\" → {rec.action}\")\n",
+ "else:\n",
+ " print(\"\\n✅ No special stratification requirements detected.\")\n",
+ " print(\" Standard stratified split by target variable is sufficient.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "02511dd8",
+ "metadata": {
+ "papermill": {
+ "duration": 0.014659,
+ "end_time": "2026-02-02T13:03:17.476454",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.461795",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "### 4.9.3 Model Selection Recommendations\n",
+ "\n",
+ "**What these recommendations tell you:**\n",
+ "- Which **model types** are well-suited for your data characteristics\n",
+ "- **Linear vs non-linear** based on relationship patterns\n",
+ "- **Ensemble considerations** based on feature interactions\n",
+ "\n",
+ "**📊 Model Selection Guide Based on Data Characteristics:**\n",
+ "\n",
+ "| Data Characteristic | Recommended Models | Reason |\n",
+ "|---------------------|-------------------|--------|\n",
+ "| Strong linear relationships | Logistic Regression, Linear SVM | Interpretable, fast, less overfit risk |\n",
+ "| Non-linear patterns | Random Forest, XGBoost, LightGBM | Capture complex interactions |\n",
+ "| High multicollinearity | Tree-based models | Robust to correlated features |\n",
+ "| Many categorical features | CatBoost, LightGBM | Native categorical handling |\n",
+ "| Imbalanced classes | Any with class_weight='balanced' | Adjust for minority class |"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1ece58ad",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-02-02T13:03:17.506118Z",
+ "iopub.status.busy": "2026-02-02T13:03:17.505993Z",
+ "iopub.status.idle": "2026-02-02T13:03:17.510367Z",
+ "shell.execute_reply": "2026-02-02T13:03:17.509634Z"
+ },
+ "papermill": {
+ "duration": 0.020016,
+ "end_time": "2026-02-02T13:03:17.510926",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.490910",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Model Selection Recommendations\n",
+ "model_recs = grouped_recs.get(RecommendationCategory.MODEL_SELECTION, [])\n",
+ "\n",
+ "print(\"=\" * 70)\n",
+ "print(\"MODEL SELECTION\")\n",
+ "print(\"=\" * 70)\n",
+ "\n",
+ "if model_recs:\n",
+ " for rec in model_recs:\n",
+ " priority_icon = \"🔴\" if rec.priority == \"high\" else \"🟡\" if rec.priority == \"medium\" else \"🟢\"\n",
+ " print(f\"\\n{priority_icon} {rec.title}\")\n",
+ " print(f\" {rec.description}\")\n",
+ " print(f\" → {rec.action}\")\n",
+ "\n",
+ "# Summary recommendations based on data characteristics\n",
+ "print(\"\\n\" + \"-\" * 70)\n",
+ "print(\"RECOMMENDED MODELING APPROACH:\")\n",
+ "\n",
+ "has_multicollinearity = len(analysis_summary.multicollinear_pairs) > 0\n",
+ "has_strong_linear = len([p for p in analysis_summary.strong_predictors if abs(p.get(\"effect_size\", 0)) >= 0.5]) > 0\n",
+ "has_categoricals = len(categorical_features) > 0\n",
+ "\n",
+ "if has_strong_linear and not has_multicollinearity:\n",
+ " print(\"\\n✅ RECOMMENDED: Start with Logistic Regression\")\n",
+ " print(\" • Strong linear relationships detected\")\n",
+ " print(\" • Interpretable coefficients for business insights\")\n",
+ " print(\" • Fast training and inference\")\n",
+ " print(\" • Then compare with tree-based ensemble for potential improvement\")\n",
+ "elif has_multicollinearity:\n",
+ " print(\"\\n✅ RECOMMENDED: Start with Random Forest or XGBoost\")\n",
+ " print(\" • Multicollinearity present - tree models handle it naturally\")\n",
+ " print(\" • Can keep all features without VIF analysis\")\n",
+ " print(\" • Use feature importance to understand contributions\")\n",
+ "else:\n",
+ " print(\"\\n✅ RECOMMENDED: Compare Linear and Tree-Based Models\")\n",
+ " print(\" • No clear linear dominance - test both approaches\")\n",
+ " print(\" • Logistic Regression for interpretability baseline\")\n",
+ " print(\" • Random Forest/XGBoost for potential accuracy gain\")\n",
+ "\n",
+ "if has_categoricals:\n",
+ " print(\"\\n💡 CATEGORICAL HANDLING:\")\n",
+ " print(\" • For tree models: Consider CatBoost or LightGBM with native categorical support\")\n",
+ " print(\" • For linear models: Use target encoding for high-cardinality features\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "87554cf9",
+ "metadata": {
+ "papermill": {
+ "duration": 0.015108,
+ "end_time": "2026-02-02T13:03:17.541056",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.525948",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "### 4.9.4 Feature Engineering Recommendations\n",
+ "\n",
+ "**What these recommendations tell you:**\n",
+ "- **Interaction features** to create based on correlation patterns\n",
+ "- **Ratio features** that may capture relative relationships\n",
+ "- **Polynomial features** for non-linear patterns\n",
+ "\n",
+ "**📊 Common Feature Engineering Patterns:**\n",
+ "\n",
+ "| Pattern Found | Feature to Create | Example |\n",
+ "|---------------|------------------|---------|\n",
+ "| Moderate correlation | Ratio feature | `feature_a / feature_b` |\n",
+ "| Both features predictive | Interaction term | `feature_a * feature_b` |\n",
+ "| Curved scatter pattern | Polynomial | `feature_a ** 2` |\n",
+ "| Related semantics | Difference | `total_orders - returned_orders` |"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "26096abe",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-02-02T13:03:17.571266Z",
+ "iopub.status.busy": "2026-02-02T13:03:17.571148Z",
+ "iopub.status.idle": "2026-02-02T13:03:17.574767Z",
+ "shell.execute_reply": "2026-02-02T13:03:17.574171Z"
+ },
+ "papermill": {
+ "duration": 0.019586,
+ "end_time": "2026-02-02T13:03:17.575301",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.555715",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Feature Engineering Recommendations\n",
+ "eng_recs = grouped_recs.get(RecommendationCategory.FEATURE_ENGINEERING, [])\n",
+ "\n",
+ "print(\"=\" * 70)\n",
+ "print(\"FEATURE ENGINEERING\")\n",
+ "print(\"=\" * 70)\n",
+ "\n",
+ "if eng_recs:\n",
+ " for rec in eng_recs:\n",
+ " priority_icon = \"🔴\" if rec.priority == \"high\" else \"🟡\" if rec.priority == \"medium\" else \"🟢\"\n",
+ " print(f\"\\n{priority_icon} {rec.title}\")\n",
+ " print(f\" {rec.description}\")\n",
+ " print(f\" → {rec.action}\")\n",
+ " if rec.affected_features:\n",
+ " print(f\" → Features: {', '.join(rec.affected_features[:5])}\")\n",
+ "else:\n",
+ " print(\"\\n✅ No specific feature engineering recommendations based on correlation patterns.\")\n",
+ " print(\" Consider domain-specific features based on business knowledge.\")\n",
+ "\n",
+ "# Additional suggestions based on strong predictors\n",
+ "if analysis_summary.strong_predictors:\n",
+ " print(\"\\n\" + \"-\" * 70)\n",
+ " print(\"POTENTIAL INTERACTION FEATURES:\")\n",
+ " strong_features = [p[\"feature\"] for p in analysis_summary.strong_predictors[:5]]\n",
+ " if len(strong_features) >= 2:\n",
+ " print(f\"\\n Based on strong predictors, consider interactions between:\")\n",
+ " for i, f1 in enumerate(strong_features[:3]):\n",
+ " for f2 in strong_features[i+1:4]:\n",
+ " print(f\" • {f1} × {f2}\")\n",
+ " print(\"\\n 💡 Tree-based models discover interactions automatically.\")\n",
+ " print(\" → For linear models, create explicit interaction columns.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "adb99bf3",
+ "metadata": {
+ "papermill": {
+ "duration": 0.014949,
+ "end_time": "2026-02-02T13:03:17.605497",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.590548",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "### 4.9.5 Recommendations Summary Table"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ccc18dbd",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-02-02T13:03:17.636573Z",
+ "iopub.status.busy": "2026-02-02T13:03:17.636412Z",
+ "iopub.status.idle": "2026-02-02T13:03:17.798977Z",
+ "shell.execute_reply": "2026-02-02T13:03:17.797952Z"
+ },
+ "papermill": {
+ "duration": 0.18038,
+ "end_time": "2026-02-02T13:03:17.801128",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.620748",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Create summary table of all recommendations\n",
+ "all_recs_data = []\n",
+ "for rec in analysis_summary.recommendations:\n",
+ " all_recs_data.append({\n",
+ " \"Category\": rec.category.value.replace(\"_\", \" \").title(),\n",
+ " \"Priority\": rec.priority.upper(),\n",
+ " \"Recommendation\": rec.title,\n",
+ " \"Action\": rec.action[:80] + \"...\" if len(rec.action) > 80 else rec.action\n",
+ " })\n",
+ "\n",
+ "if all_recs_data:\n",
+ " recs_df = pd.DataFrame(all_recs_data)\n",
+ " \n",
+ " # Sort by priority\n",
+ " priority_order = {\"HIGH\": 0, \"MEDIUM\": 1, \"LOW\": 2}\n",
+ " recs_df[\"_sort\"] = recs_df[\"Priority\"].map(priority_order)\n",
+ " recs_df = recs_df.sort_values(\"_sort\").drop(\"_sort\", axis=1)\n",
+ " \n",
+ " print(\"=\" * 80)\n",
+ " print(\"ALL RECOMMENDATIONS SUMMARY\")\n",
+ " print(\"=\" * 80)\n",
+ " print(f\"\\nTotal: {len(recs_df)} recommendations\")\n",
+ " print(f\" 🔴 High priority: {len(recs_df[recs_df['Priority'] == 'HIGH'])}\")\n",
+ " print(f\" 🟡 Medium priority: {len(recs_df[recs_df['Priority'] == 'MEDIUM'])}\")\n",
+ " print(f\" 🟢 Low priority: {len(recs_df[recs_df['Priority'] == 'LOW'])}\")\n",
+ " \n",
+ " display(recs_df)\n",
+ "\n",
+ "# Save updated findings and recommendations registry\n",
+ "findings.save(FINDINGS_PATH)\n",
+ "registry.save(RECOMMENDATIONS_PATH)\n",
+ "\n",
+ "print(f\"\\n✅ Findings updated with relationship analysis: {FINDINGS_PATH}\")\n",
+ "print(f\"✅ Recommendations registry saved: {RECOMMENDATIONS_PATH}\")\n",
+ "print(f\" Total recommendations in registry: {len(registry.all_recommendations)}\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7994f4af",
+ "metadata": {
+ "papermill": {
+ "duration": 0.0174,
+ "end_time": "2026-02-02T13:03:17.833645",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.816245",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "---\n",
+ "\n",
+ "## Summary: What We Learned\n",
+ "\n",
+ "In this notebook, we analyzed feature relationships and generated **actionable recommendations** for modeling.\n",
+ "\n",
+ "### Analysis Performed\n",
+ "\n",
+ "**Numeric Features:**\n",
+ "1. **Correlation Matrix** - Identified multicollinearity issues between feature pairs\n",
+ "2. **Effect Sizes (Cohen's d)** - Quantified how well features discriminate retained vs churned\n",
+ "3. **Box Plots** - Visualized distribution differences between classes\n",
+ "4. **Feature-Target Correlations** - Ranked features by predictive power\n",
+ "\n",
+ "**Categorical Features:**\n",
+ "5. **Cramér's V** - Measured association strength for categorical variables\n",
+ "6. **Retention by Category** - Identified high-risk segments\n",
+ "7. **Lift Analysis** - Found categories performing above/below average\n",
+ "\n",
+ "**Datetime Features:**\n",
+ "8. **Cohort Analysis** - Retention trends by signup year\n",
+ "9. **Seasonality** - Monthly patterns in retention\n",
+ "\n",
+ "### Actionable Recommendations Generated\n",
+ "\n",
+ "| Category | What It Tells You | Impact on Pipeline |\n",
+ "|----------|-------------------|-------------------|\n",
+ "| **Feature Selection** | Which features to prioritize/drop | Reduces noise, improves interpretability |\n",
+ "| **Stratification** | How to split train/test | Ensures fair evaluation |\n",
+ "| **Model Selection** | Which algorithms to try first | Matches model to data |\n",
+ "| **Feature Engineering** | Interactions to create | Captures non-linear patterns |\n",
+ "\n",
+ "### Key Metrics Reference\n",
+ "\n",
+ "| Data Type | Effect Measure | Strong Signal |\n",
+ "|-----------|---------------|---------------|\n",
+ "| Numeric | Cohen's d | \\|d\\| ≥ 0.8 |\n",
+ "| Numeric | Correlation | \\|r\\| ≥ 0.5 |\n",
+ "| Categorical | Cramér's V | V ≥ 0.3 |\n",
+ "| Categorical | Lift | < 0.9x or > 1.1x |\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## Recommended Actions Checklist\n",
+ "\n",
+ "Based on the analysis above, here are the key actions to take:\n",
+ "\n",
+ "- [ ] **Feature Selection**: Review strong/weak predictors and multicollinear pairs\n",
+ "- [ ] **Stratification**: Use stratified sampling with identified high-risk segments\n",
+ "- [ ] **Model Selection**: Start with recommended model type based on data characteristics\n",
+ "- [ ] **Feature Engineering**: Create interaction features between strong predictors\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## Next Steps\n",
+ "\n",
+ "Continue to **05_feature_opportunities.ipynb** to:\n",
+ "- Generate derived features (tenure, recency, engagement scores)\n",
+ "- Identify interaction features based on relationships found here\n",
+ "- Create business-relevant composite scores\n",
+ "- Review automated feature recommendations"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d232b566",
+ "metadata": {
+ "papermill": {
+ "duration": 0.014668,
+ "end_time": "2026-02-02T13:03:17.865087",
+ "exception": false,
+ "start_time": "2026-02-02T13:03:17.850419",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "> **Save Reminder:** Save this notebook (Ctrl+S / Cmd+S) before running the next one.\n",
+ "> The next notebook will automatically export this notebook's HTML documentation from the saved file."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ },
+ "papermill": {
+ "default_parameters": {},
+ "duration": 6.814284,
+ "end_time": "2026-02-02T13:03:20.495996",
+ "environment_variables": {},
+ "exception": null,
+ "input_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/04_relationship_analysis.ipynb",
+ "output_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/04_relationship_analysis.ipynb",
+ "parameters": {},
+ "start_time": "2026-02-02T13:03:13.681712",
+ "version": "2.6.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }