churnkit-0.75.0a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,1639 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ca3d0652",
+   "metadata": {
+    "papermill": {
+     "duration": 0.004405,
+     "end_time": "2026-02-02T13:03:07.649756",
+     "exception": false,
+     "start_time": "2026-02-02T13:03:07.645351",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "# Chapter 3: Quality Assessment\n",
+    "\n",
+    "**Purpose:** Deep dive into data quality issues with actionable remediation strategies.\n",
+    "\n",
+    "**What you'll learn:**\n",
+    "- How to analyze missing value patterns (MCAR vs MAR vs MNAR)\n",
+    "- How to detect and handle outliers using the IQR method\n",
+    "- How to validate date sequences and binary fields\n",
+    "- How to implement data cleanup strategies\n",
+    "\n",
+    "**Outputs:**\n",
+    "- Missing value analysis with correlation patterns\n",
+    "- Outlier detection with visualization\n",
+    "- Date logic validation results\n",
+    "- Binary field validation\n",
+    "- Cleanup code examples ready to use\n",
+    "\n",
+    "---\n",
+    "\n",
+    "## Quality Assessment Framework\n",
+    "\n",
+    "| Issue Type | Detection Method | Common Solutions |\n",
+    "|------------|-----------------|------------------|\n",
+    "| Missing Values | Null counts, pattern analysis | Impute (mean/median/mode), drop, flag |\n",
+    "| Outliers | IQR, Z-score, isolation forest | Cap/clip, winsorize, transform, keep (if valid) |\n",
+    "| Date Logic | Sequence validation | Set placeholders to NULL, exclude invalid |\n",
+    "| Duplicates | Key uniqueness | Drop exact, keep most recent |\n",
+    "| Invalid Values | Range/domain checks | Correct, flag, exclude |"
+   ]
+  },
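The framework table above pairs each issue type with its usual remedies. As a rough illustration of the three most common ones, here is a minimal pandas sketch; the column names are hypothetical, not taken from this package:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "customer_id": [1, 2, 2, 3, 4],                       # hypothetical key column
    "monthly_spend": [20.0, 22.5, np.nan, 19.0, 900.0],   # hypothetical numeric column
})

# Missing values: median-impute, keeping a flag that records what was imputed
df["monthly_spend_missing"] = df["monthly_spend"].isna().astype(int)
df["monthly_spend"] = df["monthly_spend"].fillna(df["monthly_spend"].median())

# Outliers: cap to the IQR fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR)
q1, q3 = df["monthly_spend"].quantile([0.25, 0.75])
iqr = q3 - q1
df["monthly_spend"] = df["monthly_spend"].clip(q1 - 1.5 * iqr, q3 + 1.5 * iqr)

# Duplicates: drop exact copies first, then enforce key uniqueness
df = df.drop_duplicates().drop_duplicates(subset=["customer_id"], keep="first")
```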
+  {
+   "cell_type": "markdown",
+   "id": "e1f1da83",
+   "metadata": {
+    "papermill": {
+     "duration": 0.003187,
+     "end_time": "2026-02-02T13:03:07.656354",
+     "exception": false,
+     "start_time": "2026-02-02T13:03:07.653167",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## 3.1 Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "535ea851",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-02-02T13:03:07.663333Z",
+     "iopub.status.busy": "2026-02-02T13:03:07.663186Z",
+     "iopub.status.idle": "2026-02-02T13:03:09.560231Z",
+     "shell.execute_reply": "2026-02-02T13:03:09.559246Z"
+    },
+    "papermill": {
+     "duration": 1.901811,
+     "end_time": "2026-02-02T13:03:09.561381",
+     "exception": false,
+     "start_time": "2026-02-02T13:03:07.659570",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+    "track_and_export_previous(\"03_quality_assessment.ipynb\")\n",
+    "\n",
+    "from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationRegistry\n",
+    "from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
+    "from customer_retention.core.config.column_config import ColumnType\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import plotly.graph_objects as go\n",
+    "import plotly.express as px\n",
+    "from plotly.subplots import make_subplots\n",
+    "from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc3eca88",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-02-02T13:03:09.567940Z",
+     "iopub.status.busy": "2026-02-02T13:03:09.567768Z",
+     "iopub.status.idle": "2026-02-02T13:03:09.911965Z",
+     "shell.execute_reply": "2026-02-02T13:03:09.910742Z"
+    },
+    "papermill": {
+     "duration": 0.349377,
+     "end_time": "2026-02-02T13:03:09.913686",
+     "exception": false,
+     "start_time": "2026-02-02T13:03:09.564309",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# === CONFIGURATION ===\n",
+    "# Option 1: Set the exact path from notebook 01 output\n",
+    "# FINDINGS_PATH = \"../experiments/findings/customer_retention_retail_abc123_findings.yaml\"\n",
+    "\n",
+    "# Option 2: Auto-discover the most recent findings file\n",
+    "from pathlib import Path\n",
+    "import yaml\n",
+    "\n",
+    "# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
+    "\n",
+    "findings_files = [f for f in FINDINGS_DIR.glob(\"*_findings.yaml\") if \"multi_dataset\" not in f.name]\n",
+    "if not findings_files:\n",
+    "    raise FileNotFoundError(f\"No findings files found in {FINDINGS_DIR}. Run notebook 01 first.\")\n",
+    "\n",
+    "# Prefer aggregated findings (from 01d) over event-level findings\n",
+    "# Pattern: *_aggregated* in filename indicates aggregated data\n",
+    "aggregated_files = [f for f in findings_files if \"_aggregated\" in f.name]\n",
+    "non_aggregated_files = [f for f in findings_files if \"_aggregated\" not in f.name]\n",
+    "\n",
+    "if aggregated_files:\n",
+    "    # Use most recent aggregated file\n",
+    "    aggregated_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)\n",
+    "    FINDINGS_PATH = str(aggregated_files[0])\n",
+    "    print(f\"Found {len(aggregated_files)} aggregated findings file(s)\")\n",
+    "    print(f\"Using: {FINDINGS_PATH}\")\n",
+    "    if non_aggregated_files:\n",
+    "        print(f\"  (Skipping {len(non_aggregated_files)} event-level findings)\")\n",
+    "else:\n",
+    "    # Fall back to most recent non-aggregated file\n",
+    "    non_aggregated_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)\n",
+    "    FINDINGS_PATH = str(non_aggregated_files[0])\n",
+    "    print(f\"Found {len(findings_files)} findings file(s)\")\n",
+    "    print(f\"Using: {FINDINGS_PATH}\")\n",
+    "\n",
+    "findings = ExplorationFindings.load(FINDINGS_PATH)\n",
+    "\n",
+    "# Load data - handle aggregated vs standard paths\n",
+    "from customer_retention.stages.temporal import load_data_with_snapshot_preference, TEMPORAL_METADATA_COLS\n",
+    "\n",
+    "# For aggregated data, load directly from the parquet source\n",
+    "if \"_aggregated\" in FINDINGS_PATH and findings.source_path.endswith('.parquet'):\n",
+    "    source_path = Path(findings.source_path)\n",
+    "    # Handle relative path from notebook directory\n",
+    "    if not source_path.is_absolute():\n",
+    "        # The source_path in findings is relative to project root\n",
+    "        if str(source_path).startswith(\"experiments\"):\n",
+    "            source_path = Path(\"..\") / source_path\n",
+    "        else:\n",
+    "            source_path = FINDINGS_DIR / source_path.name\n",
+    "    df = pd.read_parquet(source_path)\n",
+    "    data_source = f\"aggregated:{source_path.name}\"\n",
+    "else:\n",
+    "    # Standard loading for event-level or entity-level data\n",
+    "    df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
+    "\n",
+    "print(f\"Loaded data from: {data_source}\")\n",
+    "print(f\"Shape: {df.shape}\")\n",
+    "\n",
+    "charts = ChartBuilder()\n",
+    "\n",
+    "# Load or initialize recommendation registry\n",
+    "RECOMMENDATIONS_PATH = FINDINGS_PATH.replace(\"_findings.yaml\", \"_recommendations.yaml\")\n",
+    "if Path(RECOMMENDATIONS_PATH).exists():\n",
+    "    with open(RECOMMENDATIONS_PATH, \"r\") as f:\n",
+    "        registry = RecommendationRegistry.from_dict(yaml.safe_load(f))\n",
+    "    print(f\"Loaded existing recommendations from: {RECOMMENDATIONS_PATH}\")\n",
+    "else:\n",
+    "    registry = RecommendationRegistry()\n",
+    "    registry.init_bronze(findings.source_path)\n",
+    "    if findings.target_column:\n",
+    "        registry.init_gold(findings.target_column)\n",
+    "    entity_col = next((name for name, col in findings.columns.items() if col.inferred_type == ColumnType.IDENTIFIER), None)\n",
+    "    if entity_col:\n",
+    "        registry.init_silver(entity_col)\n",
+    "    print(\"Initialized new recommendation registry\")\n",
+    "\n",
+    "print(f\"\\nLoaded findings for {findings.column_count} columns\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "59b3ad0b",
+   "metadata": {
+    "papermill": {
+     "duration": 0.013553,
+     "end_time": "2026-02-02T13:03:09.939162",
+     "exception": false,
+     "start_time": "2026-02-02T13:03:09.925609",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## 3.2 Duplicate Analysis\n",
+    "\n",
+    "**📖 Why This Matters:**\n",
+    "- Duplicate records can skew statistics and model training\n",
+    "- Key column duplicates may indicate data quality issues or event-level data\n",
+    "- Value conflicts (same key, different values) require investigation\n",
+    "\n",
+    "**What to Watch For:**\n",
+    "- **Exact duplicates**: Identical rows that should be deduplicated\n",
+    "- **Key duplicates**: Same identifier with different values (may indicate updates or errors)\n",
+    "- **Value conflicts**: Same key with conflicting values in important columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4f9cfd2a",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-02-02T13:03:09.946458Z",
+     "iopub.status.busy": "2026-02-02T13:03:09.946297Z",
+     "iopub.status.idle": "2026-02-02T13:03:09.974018Z",
+     "shell.execute_reply": "2026-02-02T13:03:09.973191Z"
+    },
+    "papermill": {
+     "duration": 0.032066,
+     "end_time": "2026-02-02T13:03:09.974707",
+     "exception": false,
+     "start_time": "2026-02-02T13:03:09.942641",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from customer_retention.stages.validation import DataValidator\n",
+    "\n",
+    "validator = DataValidator()\n",
+    "\n",
+    "# Auto-detect potential key columns\n",
+    "potential_keys = [name for name, col in findings.columns.items()\n",
+    "                  if col.inferred_type.value in ('identifier', 'id') or 'id' in name.lower()]\n",
+    "KEY_COLUMN = potential_keys[0] if potential_keys else None\n",
+    "\n",
+    "print(\"=\" * 60)\n",
+    "print(\"DUPLICATE ANALYSIS\")\n",
+    "print(\"=\" * 60)\n",
+    "\n",
+    "if KEY_COLUMN:\n",
+    "    dup_result = validator.check_duplicates(df, key_column=KEY_COLUMN, check_value_conflicts=True)\n",
+    "    print(f\"\\nKey Column: {KEY_COLUMN}\")\n",
+    "    print(f\"Total Rows: {dup_result.total_rows:,}\")\n",
+    "    print(f\"Unique Keys: {dup_result.unique_keys:,}\")\n",
+    "    print(f\"Duplicate Keys: {dup_result.duplicate_keys:,} ({dup_result.duplicate_percentage:.2f}%)\")\n",
+    "\n",
+    "    # Exact duplicates\n",
+    "    if dup_result.exact_duplicate_rows > 0:\n",
+    "        print(f\"\\n⚠️ Exact duplicate rows: {dup_result.exact_duplicate_rows:,}\")\n",
+    "        dup_mask = df.duplicated(keep=False)\n",
+    "        dup_examples = df[dup_mask].head(6)\n",
+    "        if len(dup_examples) > 0:\n",
+    "            print(\"\\nExample duplicate rows:\")\n",
+    "            display(dup_examples)\n",
+    "\n",
+    "        # Add deduplication recommendation for exact duplicates\n",
+    "        registry.add_bronze_deduplication(\n",
+    "            key_column=KEY_COLUMN, strategy=\"drop_exact_duplicates\",\n",
+    "            rationale=f\"{dup_result.exact_duplicate_rows} exact duplicate rows detected\",\n",
+    "            source_notebook=\"03_quality_assessment\"\n",
+    "        )\n",
+    "    else:\n",
+    "        print(\"\\n✓ No exact duplicate rows\")\n",
+    "\n",
+    "    # Value conflicts\n",
+    "    if dup_result.has_value_conflicts:\n",
+    "        print(f\"\\n⚠️ Value conflicts detected in: {', '.join(dup_result.conflict_columns[:5])}\")\n",
+    "        if findings.target_column and findings.target_column in dup_result.conflict_columns:\n",
+    "            print(f\"  🔴 CRITICAL: Target '{findings.target_column}' has conflicting values!\")\n",
+    "\n",
+    "        # Show examples of conflicting records\n",
+    "        key_counts = df[KEY_COLUMN].value_counts()\n",
+    "        dup_keys = key_counts[key_counts > 1].head(3).index.tolist()\n",
+    "        if dup_keys:\n",
+    "            print(\"\\nExample records with duplicate keys:\")\n",
+    "            conflict_examples = df[df[KEY_COLUMN].isin(dup_keys)].sort_values(KEY_COLUMN).head(10)\n",
+    "            display(conflict_examples)\n",
+    "\n",
+    "        # Add deduplication recommendation for value conflicts\n",
+    "        registry.add_bronze_deduplication(\n",
+    "            key_column=KEY_COLUMN, strategy=\"keep_first\",\n",
+    "            rationale=f\"Value conflicts in {len(dup_result.conflict_columns)} columns\",\n",
+    "            source_notebook=\"03_quality_assessment\",\n",
+    "            conflict_columns=dup_result.conflict_columns[:5]\n",
+    "        )\n",
+    "    else:\n",
+    "        print(\"\\n✓ No value conflicts\")\n",
+    "\n",
+    "    # Duplicate frequency distribution\n",
+    "    if dup_result.duplicate_keys > 0:\n",
+    "        key_counts = df[KEY_COLUMN].value_counts()\n",
+    "        dup_distribution = key_counts[key_counts > 1].value_counts().sort_index()\n",
+    "        if len(dup_distribution) > 0:\n",
+    "            print(\"\\nDuplicate frequency distribution:\")\n",
+    "            for count, num_keys in dup_distribution.head(5).items():\n",
+    "                print(f\"  Keys appearing {count}x: {num_keys:,}\")\n",
+    "\n",
+    "    # Recommendations\n",
+    "    print(\"\\n💡 RECOMMENDATIONS:\")\n",
+    "    if dup_result.exact_duplicate_rows > 0:\n",
+    "        print(\"  • Remove exact duplicates: df.drop_duplicates()\")\n",
+    "    if dup_result.has_value_conflicts:\n",
+    "        print(\"  • For value conflicts, decide strategy:\")\n",
+    "        print(\"    - Keep most recent (if you have a timestamp)\")\n",
+    "        print(\"    - Keep first occurrence: df.drop_duplicates(subset=[KEY_COLUMN], keep='first')\")\n",
+    "        print(\"    - Aggregate values (for numeric columns)\")\n",
+    "else:\n",
+    "    print(\"\\n⚠️ No key column detected.\")\n",
+    "    print(\"  Set KEY_COLUMN above to enable duplicate analysis.\")\n",
+    "    print(f\"  Available columns: {list(findings.columns.keys())[:10]}...\")"
+   ]
+  },
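The recommendations printed by this cell mention keeping the most recent record when a timestamp is available, but only the `keep='first'` variant is shown as code. A minimal sketch of the timestamp-based strategy, assuming a hypothetical `updated_at` column alongside the detected `KEY_COLUMN`:

```python
# Keep the most recent record per key.
# Assumes a hypothetical "updated_at" timestamp column exists in df.
df_latest = (
    df.sort_values("updated_at")
      .drop_duplicates(subset=[KEY_COLUMN], keep="last")
)
```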
+  {
+   "cell_type": "markdown",
+   "id": "4d721d97",
+   "metadata": {
+    "papermill": {
+     "duration": 0.002453,
+     "end_time": "2026-02-02T13:03:09.979923",
+     "exception": false,
+     "start_time": "2026-02-02T13:03:09.977470",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## 3.3 Overall Quality Score"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "07cf69d3",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-02-02T13:03:09.985987Z",
+     "iopub.status.busy": "2026-02-02T13:03:09.985856Z",
+     "iopub.status.idle": "2026-02-02T13:03:09.988654Z",
+     "shell.execute_reply": "2026-02-02T13:03:09.988009Z"
+    },
+    "papermill": {
+     "duration": 0.006462,
+     "end_time": "2026-02-02T13:03:09.989094",
+     "exception": false,
+     "start_time": "2026-02-02T13:03:09.982632",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "print(f\"Overall Quality Score: {findings.overall_quality_score:.1f}/100\")\n",
+    "\n",
+    "if findings.overall_quality_score >= 90:\n",
+    "    print(\"Excellent: Data is high quality and ready for modeling.\")\n",
+    "elif findings.overall_quality_score >= 70:\n",
+    "    print(\"Good: Minor quality issues that should be addressed.\")\n",
+    "elif findings.overall_quality_score >= 50:\n",
+    "    print(\"Fair: Significant quality issues require attention.\")\n",
+    "else:\n",
+    "    print(\"Poor: Major quality issues must be resolved before modeling.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "09263313",
+   "metadata": {
+    "papermill": {
+     "duration": 0.002372,
+     "end_time": "2026-02-02T13:03:09.994362",
+     "exception": false,
+     "start_time": "2026-02-02T13:03:09.991990",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## 3.4 Target Variable Analysis\n",
+    "\n",
+    "Understanding the target distribution is critical because:\n",
+    "- **Class imbalance** affects model training and evaluation metrics\n",
+    "- **Business context** helps interpret what we're trying to predict\n",
+    "- **Sampling strategies** depend on imbalance severity"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8d56ad81",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-02-02T13:03:10.000690Z",
+     "iopub.status.busy": "2026-02-02T13:03:10.000575Z",
+     "iopub.status.idle": "2026-02-02T13:03:10.039386Z",
+     "shell.execute_reply": "2026-02-02T13:03:10.038940Z"
+    },
+    "papermill": {
+     "duration": 0.042715,
+     "end_time": "2026-02-02T13:03:10.039982",
+     "exception": false,
+     "start_time": "2026-02-02T13:03:09.997267",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "print(\"=\" * 60)\n",
+    "print(f\"TARGET VARIABLE DISTRIBUTION: {findings.target_column}\")\n",
+    "print(\"=\" * 60)\n",
+    "\n",
+    "if findings.target_column and findings.target_column in df.columns:\n",
+    "    target_series = df[findings.target_column]\n",
+    "    target_counts = target_series.value_counts().sort_index()\n",
+    "\n",
+    "    # Create distribution table\n",
+    "    dist_data = []\n",
+    "    for val, count in target_counts.items():\n",
+    "        pct = count / len(df) * 100\n",
+    "        dist_data.append({\n",
+    "            findings.target_column: val,\n",
+    "            \"count\": count,\n",
+    "            \"percentage\": f\"{pct:.3f}\"\n",
+    "        })\n",
+    "\n",
+    "    dist_df = pd.DataFrame(dist_data)\n",
+    "    display(dist_df)\n",
+    "\n",
+    "    # Calculate imbalance metrics\n",
+    "    if len(target_counts) == 2:\n",
+    "        majority = target_counts.max()\n",
+    "        minority = target_counts.min()\n",
+    "        minority_class = target_counts.idxmin()\n",
+    "        imbalance_ratio = majority / minority\n",
+    "        retention_rate = target_counts.get(1, 0) / len(df) * 100\n",
+    "\n",
+    "        print(f\"\\nImbalance ratio: {imbalance_ratio:.2f}:1 (minority class: {minority_class})\")\n",
+    "        print(f\"Retention rate: {retention_rate:.1f}%\")\n",
+    "\n",
+    "        # Business context\n",
+    "        if retention_rate > 70:\n",
+    "            print(f\"\\n📊 Business Context: {retention_rate:.0f}% retention is healthy!\")\n",
+    "            print(\"  Churned customers are the minority class we want to predict.\")\n",
+    "        elif retention_rate > 50:\n",
+    "            print(f\"\\n📊 Business Context: {retention_rate:.0f}% retention is moderate.\")\n",
+    "            print(\"  Balanced focus on both retention and churn prediction.\")\n",
+    "        else:\n",
+    "            print(f\"\\n⚠️ Business Context: {retention_rate:.0f}% retention is concerning!\")\n",
+    "            print(\"  High churn rate requires urgent attention.\")\n",
+    "\n",
+    "        # Modeling recommendations based on imbalance\n",
+    "        print(\"\\n⚠️ Class imbalance considerations for modeling:\")\n",
+    "        print(\"  - Use stratified sampling for train/test splits\")\n",
+    "        print(\"  - Consider class weights in model training\")\n",
+    "        print(\"  - Evaluate with Precision-Recall AUC (not just ROC-AUC)\")\n",
+    "        print(\"  - Focus on recall for churned class (catch at-risk customers)\")\n",
+    "\n",
+    "        # Add imbalance strategy recommendation\n",
+    "        if imbalance_ratio < 3:\n",
+    "            strategy = \"stratified_sampling\"\n",
+    "            rationale = f\"Mild imbalance ({imbalance_ratio:.2f}:1) - stratified sampling sufficient\"\n",
+    "            print(\"  - SMOTE not needed (imbalance is mild)\")\n",
+    "        elif imbalance_ratio < 5:\n",
+    "            strategy = \"class_weights\"\n",
+    "            rationale = f\"Moderate imbalance ({imbalance_ratio:.2f}:1) - use class weights\"\n",
+    "            print(\"  - SMOTE may not be necessary (imbalance is moderate)\")\n",
+    "        else:\n",
+    "            strategy = \"smote\"\n",
+    "            rationale = f\"Severe imbalance ({imbalance_ratio:.2f}:1) - consider SMOTE\"\n",
+    "            print(\"  - Consider SMOTE or undersampling (imbalance is severe)\")\n",
+    "\n",
+    "        registry.add_bronze_imbalance_strategy(\n",
+    "            target_column=findings.target_column,\n",
+    "            imbalance_ratio=imbalance_ratio,\n",
+    "            minority_class=minority_class,\n",
+    "            strategy=strategy,\n",
+    "            rationale=rationale,\n",
+    "            source_notebook=\"03_quality_assessment\"\n",
+    "        )\n",
+    "\n",
+    "        # Visualization\n",
+    "        fig = make_subplots(rows=1, cols=2, specs=[[{\"type\": \"pie\"}, {\"type\": \"bar\"}]],\n",
+    "                            subplot_titles=[\"Class Distribution\", \"Count Comparison\"])\n",
+    "\n",
+    "        labels = [f\"{'Retained' if v == 1 else 'Churned'} ({v})\" for v in target_counts.index]\n",
+    "        # Color order follows the sorted labels: Churned (0) red, Retained (1) green\n",
+    "        fig.add_trace(go.Pie(labels=labels, values=target_counts.values, hole=0.4,\n",
+    "                             marker_colors=[\"#e74c3c\", \"#2ecc71\"]), row=1, col=1)\n",
+    "        fig.add_trace(go.Bar(x=labels, y=target_counts.values,\n",
+    "                             marker_color=[\"#e74c3c\", \"#2ecc71\"]), row=1, col=2)\n",
+    "\n",
+    "        fig.update_layout(height=350, title_text=\"Target Variable Distribution\",\n",
+    "                          showlegend=False, template=\"plotly_white\")\n",
+    "        display_figure(fig)\n",
+    "    else:\n",
+    "        print(f\"\\nMulticlass target with {len(target_counts)} classes\")\n",
+    "\n",
+    "        fig = go.Figure(go.Bar(x=[str(v) for v in target_counts.index], y=target_counts.values,\n",
+    "                               marker_color=px.colors.qualitative.Set2[:len(target_counts)]))\n",
+    "        fig.update_layout(height=350, title_text=\"Target Variable Distribution\",\n",
+    "                          xaxis_title=findings.target_column, yaxis_title=\"Count\",\n",
+    "                          template=\"plotly_white\")\n",
+    "        display_figure(fig)\n",
+    "else:\n",
+    "    print(\"\\n⚠️ No target column detected or specified.\")\n",
+    "    print(\"  Set target_hint parameter in DataExplorer.explore() or\")\n",
+    "    print(\"  manually specify in findings.target_column\")"
+   ]
+  },
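The cutoffs used above (stratified sampling below 3:1, class weights below 5:1, SMOTE or undersampling beyond that) are heuristics rather than hard rules. A minimal scikit-learn sketch of the first two strategies, assuming a feature matrix `X` and binary target `y` (both hypothetical here, not defined by this notebook):

```python
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Stratified split: preserves the class ratio in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# class_weight="balanced": reweights classes inversely to their frequency,
# so the minority (churned) class is not drowned out during training
model = LogisticRegression(class_weight="balanced", max_iter=1000)
model.fit(X_train, y_train)
```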
+  {
+   "cell_type": "markdown",
+   "id": "baab5f1d",
+   "metadata": {
+    "papermill": {
+     "duration": 0.004319,
+     "end_time": "2026-02-02T13:03:10.048550",
+     "exception": false,
+     "start_time": "2026-02-02T13:03:10.044231",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## 3.5 Missing Value Analysis\n",
+    "\n",
+    "**📖 Interpretation Guide:**\n",
+    "- **MCAR (Missing Completely at Random)**: Missing values have no pattern - safe to impute with mean/median\n",
+    "- **MAR (Missing at Random)**: Missingness depends on other observed variables - use regression imputation\n",
+    "- **MNAR (Missing Not at Random)**: Missingness depends on the missing value itself - create missing indicator\n",
+    "\n",
+    "**⚠️ What to Watch For:**\n",
+    "- Columns with >50% missing may need to be dropped\n",
+    "- Highly correlated missing patterns suggest MAR\n",
+    "- ID columns with missing values indicate data integrity issues"
+   ]
+  },
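How you impute should follow the suspected mechanism in the guide above. A rough pandas sketch, one line of treatment per mechanism (all column names here are hypothetical):

```python
# MCAR: a simple median fill is usually safe
df["tenure_days"] = df["tenure_days"].fillna(df["tenure_days"].median())

# MAR: impute conditional on an observed variable (per-group medians)
df["monthly_spend"] = df["monthly_spend"].fillna(
    df.groupby("plan_type")["monthly_spend"].transform("median")
)

# MNAR: the missingness itself is informative - record it as a feature
df["last_login_missing"] = df["last_login"].isna().astype(int)
```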
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ee46fd0c",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-02-02T13:03:10.058874Z",
+     "iopub.status.busy": "2026-02-02T13:03:10.058751Z",
+     "iopub.status.idle": "2026-02-02T13:03:10.072668Z",
+     "shell.execute_reply": "2026-02-02T13:03:10.072279Z"
+    },
+    "papermill": {
+     "duration": 0.019481,
+     "end_time": "2026-02-02T13:03:10.073214",
+     "exception": false,
+     "start_time": "2026-02-02T13:03:10.053733",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "missing_data = []\n",
+    "for col_name, col_info in findings.columns.items():\n",
+    "    null_count = col_info.universal_metrics.get(\"null_count\", 0)\n",
+    "    null_pct = col_info.universal_metrics.get(\"null_percentage\", 0)\n",
+    "    if null_count > 0:\n",
+    "        missing_data.append({\n",
+    "            \"Column\": col_name,\n",
+    "            \"Missing Count\": null_count,\n",
+    "            \"Missing %\": f\"{null_pct:.2f}%\"\n",
+    "        })\n",
+    "\n",
+    "if missing_data:\n",
+    "    missing_df = pd.DataFrame(missing_data).sort_values(\"Missing Count\", ascending=False)\n",
+    "    print(\"Columns with Missing Values:\")\n",
+    "    display(missing_df)\n",
+    "\n",
+    "    fig = charts.bar_chart(\n",
+    "        missing_df[\"Column\"].tolist(),\n",
+    "        [float(x.replace(\"%\", \"\")) for x in missing_df[\"Missing %\"].tolist()],\n",
+    "        title=\"Missing Value Percentage by Column\"\n",
+    "    )\n",
+    "    display_figure(fig)\n",
+    "else:\n",
+    "    print(\"No missing values detected.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c921e28e",
+   "metadata": {
+    "papermill": {
+     "duration": 0.011155,
+     "end_time": "2026-02-02T13:03:10.090497",
+     "exception": false,
+     "start_time": "2026-02-02T13:03:10.079342",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## 3.6 Missing Value Patterns\n",
+    "\n",
+    "**📖 How to Read the Correlation Heatmap:**\n",
+    "- **Correlation = 1.0**: Columns always missing together (same rows)\n",
+    "- **Correlation > 0.5**: Strong pattern - investigate the relationship\n",
+    "- **Correlation ≈ 0**: Independent missing patterns (MCAR likely)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c48c133b",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-02-02T13:03:10.118334Z",
+     "iopub.status.busy": "2026-02-02T13:03:10.118209Z",
+     "iopub.status.idle": "2026-02-02T13:03:10.167892Z",
+     "shell.execute_reply": "2026-02-02T13:03:10.167167Z"
+    },
+    "papermill": {
+     "duration": 0.059538,
+     "end_time": "2026-02-02T13:03:10.168456",
+     "exception": false,
+     "start_time": "2026-02-02T13:03:10.108918",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "missing_matrix = df.isnull()\n",
+    "missing_correlations = missing_matrix.corr()\n",
+    "\n",
+    "cols_with_missing = [col for col in df.columns if df[col].isnull().any()]\n",
+    "if len(cols_with_missing) > 1:\n",
+    "    print(\"Missing Value Correlations (MCAR vs MAR analysis):\")\n",
+    "    fig = charts.heatmap(\n",
+    "        missing_correlations.loc[cols_with_missing, cols_with_missing].values,\n",
+    "        x_labels=cols_with_missing,\n",
+    "        y_labels=cols_with_missing,\n",
+    "        title=\"Missing Value Pattern Correlation\"\n",
+    "    )\n",
+    "    display_figure(fig)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c56db113",
+   "metadata": {
+    "papermill": {
+     "duration": 0.00733,
+     "end_time": "2026-02-02T13:03:10.183880",
+     "exception": false,
+     "start_time": "2026-02-02T13:03:10.176550",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## 3.7 Segment-Aware Outlier Analysis\n",
+    "\n",
+    "**📖 Why Segment Before Detecting Outliers?**\n",
+    "\n",
+    "Global outlier detection can produce **false positives** when data contains natural segments:\n",
+    "- **Retail vs Enterprise customers**: Order values of $5K may be outliers for retail but normal for enterprise\n",
+    "- **New vs Established accounts**: Activity patterns differ dramatically by customer tenure\n",
+    "- **Geographic segments**: Regional price differences can appear as outliers globally\n",
+    "\n",
+    "**⚠️ The Risk:**\n",
+    "If you remove \"outliers\" that are actually valid data from a different segment, you lose critical patterns needed for accurate modeling.\n",
+    "\n",
+    "**📊 What This Analysis Does:**\n",
+    "1. Detects natural data segments (using clustering or explicit segment columns)\n",
+    "2. Compares global outliers vs segment-specific outliers\n",
+    "3. Identifies \"false outliers\" - values flagged globally but normal within their segment\n",
+    "4. Recommends whether segment-specific outlier treatment is beneficial"
+   ]
+  },
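Before reaching for the analyzer below, the core idea can be reproduced with plain pandas: compute IQR fences once globally and once per segment, then compare what each view flags. A minimal sketch; the `customer_segment` and `order_value` names are hypothetical:

```python
import pandas as pd

def iqr_outlier_mask(s: pd.Series) -> pd.Series:
    """True where a value falls outside the 1.5*IQR fences of s."""
    q1, q3 = s.quantile([0.25, 0.75])
    iqr = q3 - q1
    return (s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)

# Global view: one set of fences for the whole column
global_outliers = iqr_outlier_mask(df["order_value"])

# Segment view: fences computed within each segment
segment_outliers = df.groupby("customer_segment")["order_value"].transform(iqr_outlier_mask)

# "False outliers": flagged globally but normal inside their own segment
false_outliers = global_outliers & ~segment_outliers
print(f"{int(false_outliers.sum())} of {int(global_outliers.sum())} global outliers are segment-normal")
```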
{
"cell_type": "code",
"execution_count": null,
"id": "74cff73c",
"metadata": {
"execution": {
"iopub.execute_input": "2026-02-02T13:03:10.199709Z",
"iopub.status.busy": "2026-02-02T13:03:10.199576Z",
"iopub.status.idle": "2026-02-02T13:03:10.536146Z",
"shell.execute_reply": "2026-02-02T13:03:10.535137Z"
},
"papermill": {
"duration": 0.345634,
"end_time": "2026-02-02T13:03:10.537088",
"exception": false,
"start_time": "2026-02-02T13:03:10.191454",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"from customer_retention.stages.profiling import SegmentAwareOutlierAnalyzer\n",
"\n",
"print(\"=\" * 80)\n",
"print(\"SEGMENT-AWARE OUTLIER ANALYSIS\")\n",
"print(\"=\" * 80)\n",
"\n",
"# Get numeric columns for analysis\n",
"numeric_cols = [\n",
"    name for name, col in findings.columns.items()\n",
"    if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
"    and name not in TEMPORAL_METADATA_COLS\n",
"]\n",
"\n",
"# === CONFIGURATION ===\n",
"# Option 1: Specify an explicit segment column if you have one (e.g., customer_type, region)\n",
"SEGMENT_COL = None  # e.g., \"customer_segment\", \"account_type\"\n",
"\n",
"# Option 2: Load from findings metadata if saved in previous notebook\n",
"if SEGMENT_COL is None and \"segment_column\" in findings.metadata:\n",
"    SEGMENT_COL = findings.metadata[\"segment_column\"]\n",
"    print(f\"Using segment column from findings: {SEGMENT_COL}\")\n",
"\n",
"if numeric_cols:\n",
"    analyzer = SegmentAwareOutlierAnalyzer(max_segments=5)\n",
"\n",
"    # Run segment-aware analysis\n",
"    segment_result = analyzer.analyze(\n",
"        df,\n",
"        feature_cols=numeric_cols,\n",
"        segment_col=SEGMENT_COL,\n",
"        target_col=findings.target_column\n",
"    )\n",
"\n",
"    print(\"\\n📊 SEGMENTATION RESULTS:\")\n",
"    print(f\" Segments detected: {segment_result.n_segments}\")\n",
"\n",
"    if segment_result.n_segments > 1:\n",
"        print(\"\\n📈 GLOBAL VS SEGMENT OUTLIER COMPARISON:\")\n",
"        print(\"-\" * 60)\n",
"\n",
"        comparison_data = []\n",
"        for col in numeric_cols:\n",
"            global_outliers = segment_result.global_analysis[col].outliers_detected\n",
"            segment_outliers = sum(\n",
"                seg[col].outliers_detected\n",
"                for seg in segment_result.segment_analysis.values()\n",
"                if col in seg\n",
"            )\n",
"            false_outliers = segment_result.false_outliers.get(col, 0)\n",
"\n",
"            if global_outliers > 0:\n",
"                reduction_pct = (global_outliers - segment_outliers) / global_outliers * 100\n",
"                false_pct = false_outliers / global_outliers * 100\n",
"            else:\n",
"                reduction_pct = 0\n",
"                false_pct = 0\n",
"\n",
"            comparison_data.append({\n",
"                \"Feature\": col,\n",
"                \"Global Outliers\": global_outliers,\n",
"                \"Segment Outliers\": segment_outliers,\n",
"                \"False Outliers\": false_outliers,\n",
"                \"Reduction\": f\"{reduction_pct:.1f}%\"\n",
"            })\n",
"\n",
"        comparison_df = pd.DataFrame(comparison_data)\n",
"        display(comparison_df)\n",
"\n",
"        # Show false outlier analysis\n",
"        has_false_outliers = any(segment_result.false_outliers.get(col, 0) > 0 for col in numeric_cols)\n",
"\n",
"        if has_false_outliers:\n",
"            print(\"\\n⚠️ FALSE OUTLIERS DETECTED:\")\n",
"            print(\" (Global outliers that are normal within their segment)\")\n",
"            for col, count in segment_result.false_outliers.items():\n",
"                if count > 0:\n",
"                    global_count = segment_result.global_analysis[col].outliers_detected\n",
"                    pct = count / global_count * 100 if global_count > 0 else 0\n",
"                    print(f\" • {col}: {count} false outliers ({pct:.1f}% of global)\")\n",
"\n",
"        # Recommendations\n",
"        print(\"\\n💡 RECOMMENDATIONS:\")\n",
"        if segment_result.segmentation_recommended:\n",
"            print(\" ✅ SEGMENT-SPECIFIC OUTLIER TREATMENT RECOMMENDED\")\n",
"            for rec in segment_result.recommendations:\n",
"                print(f\" • {rec}\")\n",
"\n",
"            # Add outlier recommendations for columns with high false outlier rate\n",
"            for col, count in segment_result.false_outliers.items():\n",
"                if count > 0:\n",
"                    global_count = segment_result.global_analysis[col].outliers_detected\n",
"                    false_pct = count / global_count * 100 if global_count > 0 else 0\n",
"                    if false_pct > 50:  # High false outlier rate\n",
"                        registry.add_bronze_outlier(\n",
"                            column=col, action=\"segment_aware_cap\",\n",
"                            parameters={\"method\": \"segment_iqr\", \"n_segments\": segment_result.n_segments},\n",
"                            rationale=f\"{false_pct:.0f}% of global outliers are segment-normal\",\n",
"                            source_notebook=\"03_quality_assessment\"\n",
"                        )\n",
"        else:\n",
"            print(\" ℹ️ Global outlier treatment is appropriate for this data\")\n",
"\n",
"        # Rationale\n",
"        print(\"\\n📋 RATIONALE:\")\n",
"        for rationale in segment_result.rationale:\n",
"            print(f\" • {rationale}\")\n",
"\n",
"        # Visualization: Compare outlier counts\n",
"        cols_with_diff = [\n",
"            row[\"Feature\"] for _, row in comparison_df.iterrows()\n",
"            if row[\"Global Outliers\"] > 0 and row[\"Global Outliers\"] != row[\"Segment Outliers\"]\n",
"        ]\n",
"\n",
"        if cols_with_diff and len(cols_with_diff) <= 8:\n",
"            fig = go.Figure()\n",
"\n",
"            global_counts = [comparison_df[comparison_df[\"Feature\"] == c][\"Global Outliers\"].values[0] for c in cols_with_diff]\n",
"            segment_counts = [comparison_df[comparison_df[\"Feature\"] == c][\"Segment Outliers\"].values[0] for c in cols_with_diff]\n",
"\n",
"            fig.add_trace(go.Bar(name=\"Global Outliers\", x=cols_with_diff, y=global_counts, marker_color=\"#e74c3c\"))\n",
"            fig.add_trace(go.Bar(name=\"Segment Outliers\", x=cols_with_diff, y=segment_counts, marker_color=\"#2ecc71\"))\n",
"\n",
"            fig.update_layout(\n",
"                barmode=\"group\",\n",
"                title=\"Global vs Segment-Specific Outlier Detection\",\n",
"                xaxis_title=\"Feature\",\n",
"                yaxis_title=\"Outlier Count\",\n",
"                template=\"plotly_white\",\n",
"                height=400\n",
"            )\n",
"            display_figure(fig)\n",
"    else:\n",
"        print(\"\\n ℹ️ Data appears homogeneous (single segment)\")\n",
"        print(\" → Proceeding with standard global outlier detection\")\n",
"\n",
"    # Store result in findings metadata for use in later notebooks\n",
"    findings.metadata[\"segment_aware_analysis\"] = {\n",
"        \"n_segments\": segment_result.n_segments,\n",
"        \"segmentation_recommended\": segment_result.segmentation_recommended,\n",
"        \"recommendations\": segment_result.recommendations\n",
"    }\n",
"else:\n",
"    print(\"\\nNo numeric columns to analyze for outliers.\")"
]
},
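`SegmentAwareOutlierAnalyzer` is churnkit's own implementation and its internals are not part of this diff. For intuition only: when no explicit segment column exists, one plausible way to detect segments is k-means on standardized numeric features. A minimal sketch under that assumption (scikit-learn, synthetic data, hypothetical column names):

```python
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "monthly_spend": np.concatenate([rng.normal(50, 10, 500), rng.normal(500, 80, 100)]),
    "orders": np.concatenate([rng.poisson(3, 500), rng.poisson(30, 100)]).astype(float),
})

# Standardize so no single feature dominates the distance metric.
X = StandardScaler().fit_transform(df)

# Fit a small, fixed number of clusters; a real analyzer would likely
# scan k up to max_segments and score each candidate (e.g., via silhouette).
df["segment"] = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)
print(df.groupby("segment").mean().round(1))
```

Whether churnkit selects k this way is not shown here; the point is that per-segment outlier bounds only make sense once some segmentation, explicit or clustered, is in hand.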
{
"cell_type": "markdown",
"id": "6472d7a4",
"metadata": {
"papermill": {
"duration": 0.008068,
"end_time": "2026-02-02T13:03:10.552669",
"exception": false,
"start_time": "2026-02-02T13:03:10.544601",
"status": "completed"
},
"tags": []
},
"source": [
"## 3.8 Global Outlier Detection\n",
"\n",
"**📖 IQR Method Explained:**\n",
"- **Q1** = 25th percentile, **Q3** = 75th percentile\n",
"- **IQR** = Q3 - Q1 (the middle 50% of data)\n",
"- **Lower Bound** = Q1 - 1.5 × IQR\n",
"- **Upper Bound** = Q3 + 1.5 × IQR\n",
"- Values outside these bounds are considered outliers\n",
"\n",
"**⚠️ Important Considerations:**\n",
"- Review section 3.7 above to determine if global or segment-specific outlier treatment is appropriate\n",
"- Outliers in rate fields (>100%) are likely errors → Cap at 100\n",
"- Outliers in amount fields may be valid high-value customers → Keep but consider capping for modeling\n",
"- High outlier % (>10%) suggests heavy-tailed distribution → Consider log transform instead of capping"
]
},
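To make the bounds concrete, here is a minimal worked example of the formulas above on hypothetical values (plain pandas):

```python
import pandas as pd

s = pd.Series([10, 12, 13, 14, 15, 16, 18, 95])  # one extreme value
q1, q3 = s.quantile(0.25), s.quantile(0.75)      # 12.75 and 16.5 here
iqr = q3 - q1                                    # 3.75
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr    # 7.125 and 22.125
print(s[(s < lower) | (s > upper)].tolist())     # [95]: only the extreme point is flagged
```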
{
"cell_type": "code",
"execution_count": null,
"id": "4029d109",
"metadata": {
"execution": {
"iopub.execute_input": "2026-02-02T13:03:10.570427Z",
"iopub.status.busy": "2026-02-02T13:03:10.570253Z",
"iopub.status.idle": "2026-02-02T13:03:10.633647Z",
"shell.execute_reply": "2026-02-02T13:03:10.629513Z"
},
"papermill": {
"duration": 0.078553,
"end_time": "2026-02-02T13:03:10.638719",
"exception": false,
"start_time": "2026-02-02T13:03:10.560166",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"print(\"=\" * 80)\n",
"print(\"OUTLIER DETECTION (IQR Method)\")\n",
"print(\"=\" * 80)\n",
"\n",
"numeric_cols = [\n",
"    name for name, col in findings.columns.items()\n",
"    if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
"    and name not in TEMPORAL_METADATA_COLS\n",
"]\n",
"\n",
"# Build comprehensive outlier table\n",
"outlier_data = []\n",
"for col_name in numeric_cols:\n",
"    series = df[col_name].dropna()\n",
"    if series.empty:\n",
"        continue  # all-null column; nothing to measure\n",
"    q1 = series.quantile(0.25)\n",
"    q3 = series.quantile(0.75)\n",
"    iqr = q3 - q1\n",
"    lower_bound = q1 - 1.5 * iqr\n",
"    upper_bound = q3 + 1.5 * iqr\n",
"\n",
"    outliers_low = (series < lower_bound).sum()\n",
"    outliers_high = (series > upper_bound).sum()\n",
"    total_outliers = outliers_low + outliers_high\n",
"\n",
"    outlier_data.append({\n",
"        \"feature\": col_name,\n",
"        \"Q1\": q1,\n",
"        \"Q3\": q3,\n",
"        \"IQR\": iqr,\n",
"        \"lower_bound\": lower_bound,\n",
"        \"upper_bound\": upper_bound,\n",
"        \"outliers_low\": outliers_low,\n",
"        \"outliers_high\": outliers_high,\n",
"        \"total_outliers\": total_outliers,\n",
"        \"outlier_pct\": total_outliers / len(series) * 100\n",
"    })\n",
"\n",
"outlier_df = pd.DataFrame(outlier_data)\n",
"\n",
"# Display IQR bounds table\n",
"print(\"\\n📊 IQR BOUNDS TABLE:\")\n",
"bounds_display = outlier_df[[\"feature\", \"Q1\", \"Q3\", \"IQR\", \"lower_bound\", \"upper_bound\",\n",
"                             \"outliers_low\", \"outliers_high\"]].copy()\n",
"for col in [\"Q1\", \"Q3\", \"IQR\", \"lower_bound\", \"upper_bound\"]:\n",
"    bounds_display[col] = bounds_display[col].apply(lambda x: f\"{x:.2f}\")\n",
"display(bounds_display)\n",
"\n",
"# Outlier summary for columns with issues\n",
"cols_with_outliers = outlier_df[outlier_df[\"total_outliers\"] > 0].copy()\n",
"if len(cols_with_outliers) > 0:\n",
"    print(\"\\n⚠️ COLUMNS WITH OUTLIERS:\")\n",
"    for _, row in cols_with_outliers.iterrows():\n",
"        severity = \"🔴 HIGH\" if row[\"outlier_pct\"] > 10 else \"🟡 MEDIUM\" if row[\"outlier_pct\"] > 5 else \"🟢 LOW\"\n",
"        print(f\"\\n {row['feature']}: {row['total_outliers']:,} outliers ({row['outlier_pct']:.2f}%) {severity}\")\n",
"        print(f\" Lower bound: {row['lower_bound']:.2f} | Upper bound: {row['upper_bound']:.2f}\")\n",
"        if row[\"outliers_low\"] > 0:\n",
"            print(f\" Below lower: {row['outliers_low']:,}\")\n",
"        if row[\"outliers_high\"] > 0:\n",
"            print(f\" Above upper: {row['outliers_high']:,}\")\n",
"\n",
"        # Determine action and add recommendation (skip if segment-aware already added)\n",
"        col_name = row['feature']\n",
"        existing_outlier_recs = [r for r in registry.bronze.outlier_handling if r.target_column == col_name]\n",
"\n",
"        if not existing_outlier_recs and row[\"outlier_pct\"] > 5:  # Only add if significant and not already handled\n",
"            if row[\"outlier_pct\"] > 10:\n",
"                action = \"log_transform\"\n",
"                rationale = f\"{row['outlier_pct']:.1f}% outliers - heavy tails require log transform\"\n",
"                print(\" → Consider log transform or RobustScaler\")\n",
"            else:\n",
"                action = \"winsorize\"\n",
"                rationale = f\"{row['outlier_pct']:.1f}% outliers - winsorize to 1st/99th percentile\"\n",
"                print(\" → Consider Winsorization (clip to 1st/99th percentile)\")\n",
"\n",
"            registry.add_bronze_outlier(\n",
"                column=col_name, action=action,\n",
"                parameters={\"method\": \"iqr\", \"lower_bound\": row[\"lower_bound\"], \"upper_bound\": row[\"upper_bound\"]},\n",
"                rationale=rationale,\n",
"                source_notebook=\"03_quality_assessment\"\n",
"            )\n",
"        elif row[\"outlier_pct\"] <= 5:\n",
"            print(\" → Minor issue, can cap at IQR bounds if needed\")\n",
"else:\n",
"    print(\"\\n✅ No significant outliers detected in numeric columns\")\n",
"\n",
"# Box plots for columns with outliers\n",
"if len(cols_with_outliers) > 0 and len(cols_with_outliers) <= 6:\n",
"    outlier_cols = cols_with_outliers[\"feature\"].tolist()\n",
"\n",
"    fig = make_subplots(rows=1, cols=len(outlier_cols), subplot_titles=outlier_cols)\n",
"\n",
"    for i, col in enumerate(outlier_cols, 1):\n",
"        fig.add_trace(\n",
"            go.Box(y=df[col].dropna(), name=col, boxpoints=\"outliers\",\n",
"                   marker_color=\"#3498db\", showlegend=False),\n",
"            row=1, col=i\n",
"        )\n",
"\n",
"    fig.update_layout(height=400, title_text=\"Outlier Distribution (Box Plots)\",\n",
"                      template=\"plotly_white\")\n",
"    display_figure(fig)"
]
},
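The `winsorize` action recorded above is applied downstream; as a reference for what "clip to 1st/99th percentile" means, here is a minimal pandas sketch (illustrative, not the churnkit implementation):

```python
import pandas as pd

def winsorize_series(s: pd.Series, lower_q: float = 0.01, upper_q: float = 0.99) -> pd.Series:
    """Clip values to the given quantiles; NaNs pass through unchanged."""
    lo, hi = s.quantile(lower_q), s.quantile(upper_q)
    return s.clip(lower=lo, upper=hi)

s = pd.Series([1, 2, 3, 4, 5, 1000])
print(winsorize_series(s).tolist())  # both extremes are pulled in to the quantile values
```

Unlike dropping rows, this keeps every observation while bounding its leverage, which is why it is preferred for moderate outlier rates.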
{
"cell_type": "markdown",
"id": "4d39c9ea",
"metadata": {
"papermill": {
"duration": 0.011699,
"end_time": "2026-02-02T13:03:10.667247",
"exception": false,
"start_time": "2026-02-02T13:03:10.655548",
"status": "completed"
},
"tags": []
},
"source": [
"## 3.9 Date Logic Validation\n",
"\n",
"**📖 What This Checks:**\n",
"- Date ranges and suspicious placeholder dates (e.g., 1/1/1900, 1/1/2004)\n",
"- Date sequence violations if `DATE_SEQUENCE` is configured below\n",
"\n",
"**⚠️ Common Issues:**\n",
"- Very old dates (pre-2005): Often placeholder values → Set to NULL\n",
"- Sequence violations (e.g., `last_purchase < first_purchase`): Data entry errors → Flag for review\n",
"\n",
"**💡 Configuration:**\n",
"Set `DATE_SEQUENCE` below to validate that dates occur in expected chronological order."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a4d5ce7",
"metadata": {
"execution": {
"iopub.execute_input": "2026-02-02T13:03:10.686601Z",
"iopub.status.busy": "2026-02-02T13:03:10.686469Z",
"iopub.status.idle": "2026-02-02T13:03:10.692184Z",
"shell.execute_reply": "2026-02-02T13:03:10.691521Z"
},
"papermill": {
"duration": 0.016241,
"end_time": "2026-02-02T13:03:10.692788",
"exception": false,
"start_time": "2026-02-02T13:03:10.676547",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# === DATE SEQUENCE CONFIGURATION ===\n",
"# Define expected chronological order of date columns (earliest to latest)\n",
"# Example: [\"account_created\", \"first_purchase\", \"last_purchase\"]\n",
"# Set to None or empty list to skip sequence validation\n",
"\n",
"# Option 1: Override here\n",
"DATE_SEQUENCE = None  # e.g., [\"created\", \"firstorder\", \"lastorder\"]\n",
"\n",
"# Option 2: Load from findings (saved in notebook 01)\n",
"if DATE_SEQUENCE is None and \"date_sequence\" in findings.metadata:\n",
"    DATE_SEQUENCE = findings.metadata[\"date_sequence\"]\n",
"    print(f\"Loaded date sequence from findings: {DATE_SEQUENCE}\")\n",
"\n",
"# Detect date columns from findings\n",
"date_cols = [name for name, col in findings.columns.items()\n",
"             if col.inferred_type == ColumnType.DATETIME]\n",
"\n",
"print(\"=\" * 60)\n",
"print(\"DATE LOGIC VALIDATION\")\n",
"print(\"=\" * 60)\n",
"print(f\"\\nDetected date columns: {date_cols}\")\n",
"\n",
"if date_cols:\n",
"    df_dates = df.copy()\n",
"    for col in date_cols:\n",
"        df_dates[col] = pd.to_datetime(df_dates[col], errors='coerce', format='mixed')\n",
"\n",
"    # Date ranges\n",
"    print(\"\\n📅 DATE RANGES:\")\n",
"    for col in date_cols:\n",
"        print(f\" {col}: {df_dates[col].min()} to {df_dates[col].max()}\")\n",
"\n",
"    # Placeholder detection\n",
"    print(\"\\n🕵️ PLACEHOLDER DATE DETECTION:\")\n",
"    for col in date_cols:\n",
"        old_dates = (df_dates[col] < '2005-01-01').sum()\n",
"        if old_dates > 0:\n",
"            print(f\" {col}: {old_dates:,} dates before 2005 (possible placeholders)\")\n",
"        else:\n",
"            print(f\" {col}: No suspicious early dates\")\n",
"\n",
"    # Sequence validation\n",
"    if DATE_SEQUENCE and len(DATE_SEQUENCE) >= 2:\n",
"        valid_sequence_cols = [c for c in DATE_SEQUENCE if c in date_cols]\n",
"        if len(valid_sequence_cols) >= 2:\n",
"            print(\"\\n🔗 DATE SEQUENCE VALIDATION:\")\n",
"            print(f\" Expected order: {' ≤ '.join(valid_sequence_cols)}\")\n",
"\n",
"            total_violations = 0\n",
"            for i in range(len(valid_sequence_cols) - 1):\n",
"                col1, col2 = valid_sequence_cols[i], valid_sequence_cols[i + 1]\n",
"                # Check where col2 < col1 (violation)\n",
"                mask = df_dates[col1].notna() & df_dates[col2].notna()\n",
"                violations = (df_dates.loc[mask, col2] < df_dates.loc[mask, col1]).sum()\n",
"                total_violations += violations\n",
"\n",
"                if violations > 0:\n",
"                    pct = violations / mask.sum() * 100\n",
"                    print(f\" ⚠️ {col2} < {col1}: {violations:,} violations ({pct:.2f}%)\")\n",
"                else:\n",
"                    print(f\" ✓ {col1} ≤ {col2}: No violations\")\n",
"\n",
"            if total_violations == 0:\n",
"                print(\"\\n ✅ All date sequences valid\")\n",
"            else:\n",
"                print(f\"\\n ⚠️ Total sequence violations: {total_violations:,}\")\n",
"        else:\n",
"            print(f\"\\n⚠️ DATE_SEQUENCE columns not found in data: {DATE_SEQUENCE}\")\n",
"            print(f\" Available date columns: {date_cols}\")\n",
"    else:\n",
"        print(\"\\n💡 TIP: Set DATE_SEQUENCE above or in notebook 01 to enable sequence validation\")\n",
"        if len(date_cols) >= 2:\n",
"            print(f\" Available date columns: {date_cols}\")\n",
"else:\n",
"    print(\"\\nNo date columns detected.\")"
]
},
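The "Set to NULL" fix for placeholder dates is a one-liner in pandas. A minimal sketch with a hypothetical `created` column and the same pre-2005 cutoff used above:

```python
import pandas as pd

df = pd.DataFrame({"created": pd.to_datetime(["1900-01-01", "2019-06-01", "2021-03-15"])})

# Anything before the plausibility cutoff is treated as a placeholder and nulled.
cutoff = pd.Timestamp("2005-01-01")
df.loc[df["created"] < cutoff, "created"] = pd.NaT
print(df)
```

Nulling rather than dropping preserves the rest of the row, so downstream null-handling strategies can still impute or flag it.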
{
"cell_type": "markdown",
"id": "e4ba09b9",
"metadata": {
"papermill": {
"duration": 0.010265,
"end_time": "2026-02-02T13:03:10.710981",
"exception": false,
"start_time": "2026-02-02T13:03:10.700716",
"status": "completed"
},
"tags": []
},
"source": [
"## 3.10 Binary Field Validation\n",
"\n",
"Binary fields should contain only 0 and 1 values. Any other values indicate data quality issues."
]
},
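When a nominally binary column does carry invalid values, a common repair is an explicit value map, leaving unmapped entries as NA for review. A minimal sketch with hypothetical variants (not a churnkit API):

```python
import pandas as pd

s = pd.Series([1, 0, "Y", "N", True, None])

# Hypothetical mapping of observed variants onto 0/1; True/False hash like 1/0,
# so they are covered by the integer keys. Unmapped values become <NA>.
value_map = {1: 1, 0: 0, "Y": 1, "N": 0}
print(s.map(value_map).astype("Int64").tolist())  # [1, 0, 1, 0, 1, <NA>]
```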
{
"cell_type": "code",
"execution_count": null,
"id": "f796b6a4",
"metadata": {
"execution": {
"iopub.execute_input": "2026-02-02T13:03:10.728169Z",
"iopub.status.busy": "2026-02-02T13:03:10.728054Z",
"iopub.status.idle": "2026-02-02T13:03:10.735906Z",
"shell.execute_reply": "2026-02-02T13:03:10.735438Z"
},
"papermill": {
"duration": 0.017183,
"end_time": "2026-02-02T13:03:10.736484",
"exception": false,
"start_time": "2026-02-02T13:03:10.719301",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"binary_cols = [name for name, col in findings.columns.items()\n",
"               if col.inferred_type == ColumnType.BINARY\n",
"               and name not in TEMPORAL_METADATA_COLS]\n",
"\n",
"print(\"=\" * 60)\n",
"print(\"BINARY FIELD VALIDATION\")\n",
"print(\"=\" * 60)\n",
"print(f\"\\nDetected binary columns: {binary_cols}\")\n",
"\n",
"if binary_cols:\n",
"    binary_results = []\n",
"    for col in binary_cols:\n",
"        unique_vals = sorted(df[col].dropna().unique())\n",
"        is_valid = set(unique_vals).issubset({0, 1, 0.0, 1.0})\n",
"        count_0 = (df[col] == 0).sum()\n",
"        count_1 = (df[col] == 1).sum()\n",
"        total = count_0 + count_1\n",
"        pct_0 = count_0 / total * 100 if total > 0 else 0\n",
"        pct_1 = count_1 / total * 100 if total > 0 else 0\n",
"\n",
"        binary_results.append({\n",
"            'column': col,\n",
"            'unique_values': unique_vals,\n",
"            'is_valid': is_valid,\n",
"            'count_0': count_0,\n",
"            'count_1': count_1,\n",
"            'pct_1': pct_1\n",
"        })\n",
"\n",
"        status = \"✓\" if is_valid else \"⚠️\"\n",
"        print(f\"\\n{status} {col}:\")\n",
"        print(f\" Unique values: {unique_vals}\")\n",
"        print(f\" 0 (No): {count_0:,} ({pct_0:.1f}%)\")\n",
"        print(f\" 1 (Yes): {count_1:,} ({pct_1:.1f}%)\")\n",
"\n",
"        if not is_valid:\n",
"            invalid_vals = [v for v in unique_vals if v not in [0, 1, 0.0, 1.0]]\n",
"            print(f\" ⚠️ Invalid values found: {invalid_vals}\")\n",
"\n",
"    if len(binary_cols) <= 6:\n",
"        n_cols = len(binary_cols)\n",
"        fig = make_subplots(rows=1, cols=n_cols, subplot_titles=binary_cols)\n",
"\n",
"        for i, col in enumerate(binary_cols, 1):\n",
"            counts = df[col].value_counts().sort_index()\n",
"            fig.add_trace(\n",
"                go.Bar(x=['No (0)', 'Yes (1)'], y=[counts.get(0, 0), counts.get(1, 0)],\n",
"                       marker_color=['#d62728', '#2ca02c'], showlegend=False),\n",
"                row=1, col=i\n",
"            )\n",
"\n",
"        fig.update_layout(height=350, title_text=\"Binary Field Distributions\",\n",
"                          template='plotly_white')\n",
"        display_figure(fig)\n",
"else:\n",
"    print(\"\\nNo binary columns detected.\")"
]
},
{
"cell_type": "markdown",
"id": "86ff0afa",
"metadata": {
"papermill": {
"duration": 0.007937,
"end_time": "2026-02-02T13:03:10.752592",
"exception": false,
"start_time": "2026-02-02T13:03:10.744655",
"status": "completed"
},
"tags": []
},
"source": [
"## 3.11 Data Consistency Checks\n",
"\n",
"Check for case variants, leading/trailing spaces, and other string inconsistencies."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3639dacf",
"metadata": {
"execution": {
"iopub.execute_input": "2026-02-02T13:03:10.769450Z",
"iopub.status.busy": "2026-02-02T13:03:10.769328Z",
"iopub.status.idle": "2026-02-02T13:03:10.777257Z",
"shell.execute_reply": "2026-02-02T13:03:10.776709Z"
},
"papermill": {
"duration": 0.017719,
"end_time": "2026-02-02T13:03:10.777906",
"exception": false,
"start_time": "2026-02-02T13:03:10.760187",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"consistency_issues = []\n",
"\n",
"for col_name in df.select_dtypes(include=['object']).columns:\n",
"    if col_name in TEMPORAL_METADATA_COLS:\n",
"        continue\n",
"    unique_vals = df[col_name].dropna().unique()\n",
"    case_variants = {}\n",
"    for val in unique_vals:\n",
"        lower_val = str(val).lower().strip()\n",
"        if lower_val not in case_variants:\n",
"            case_variants[lower_val] = []\n",
"        case_variants[lower_val].append(val)\n",
"\n",
"    for lower_val, variants in case_variants.items():\n",
"        if len(variants) > 1:\n",
"            consistency_issues.append({\n",
"                \"Column\": col_name,\n",
"                \"Issue\": \"Case/Spacing Variants\",\n",
"                \"Details\": str(variants[:5]),\n",
"                \"variants\": variants\n",
"            })\n",
"\n",
"if consistency_issues:\n",
"    print(\"Data Consistency Issues:\")\n",
"    display(pd.DataFrame([{k: v for k, v in issue.items() if k != \"variants\"} for issue in consistency_issues]))\n",
"\n",
"    # Add consistency recommendations\n",
"    for issue in consistency_issues:\n",
"        registry.add_bronze_consistency(\n",
"            column=issue[\"Column\"],\n",
"            issue_type=\"case_variants\",\n",
"            action=\"normalize_lower\",\n",
"            variants=issue[\"variants\"][:5],\n",
"            rationale=f\"{len(issue['variants'])} case/spacing variants detected\",\n",
"            source_notebook=\"03_quality_assessment\"\n",
"        )\n",
"else:\n",
"    print(\"No consistency issues detected.\")"
]
},
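The `normalize_lower` action recorded for each flagged column corresponds to a strip-and-lowercase pass, mirroring how the detector groups variants. A minimal pandas sketch with a hypothetical `plan` column:

```python
import pandas as pd

df = pd.DataFrame({"plan": ["Premium", "premium ", " PREMIUM", "basic"]})

# Collapse case and spacing variants onto one canonical form.
df["plan"] = df["plan"].str.strip().str.lower()
print(df["plan"].unique())  # ['premium' 'basic']
```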
{
"cell_type": "markdown",
"id": "2a3c2f34",
"metadata": {
"papermill": {
"duration": 0.007977,
"end_time": "2026-02-02T13:03:10.793870",
"exception": false,
"start_time": "2026-02-02T13:03:10.785893",
"status": "completed"
},
"tags": []
},
"source": [
"## 3.12 Quality Improvement Recommendations\n",
"\n",
"Automated recommendations based on the issues detected above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a9ba7d76",
"metadata": {
"execution": {
"iopub.execute_input": "2026-02-02T13:03:10.811120Z",
"iopub.status.busy": "2026-02-02T13:03:10.810941Z",
"iopub.status.idle": "2026-02-02T13:03:10.824714Z",
"shell.execute_reply": "2026-02-02T13:03:10.823869Z"
},
"papermill": {
"duration": 0.023502,
"end_time": "2026-02-02T13:03:10.825506",
"exception": false,
"start_time": "2026-02-02T13:03:10.802004",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"from customer_retention.analysis.auto_explorer import RecommendationEngine\n",
"\n",
"recommender = RecommendationEngine()\n",
"cleaning_recs = recommender.recommend_cleaning(findings)\n",
"\n",
"print(\"=\" * 80)\n",
"print(\"QUALITY IMPROVEMENT RECOMMENDATIONS\")\n",
"print(\"=\" * 80)\n",
"\n",
"if cleaning_recs:\n",
"    # Group by severity\n",
"    high_severity = [r for r in cleaning_recs if r.severity == \"high\"]\n",
"    medium_severity = [r for r in cleaning_recs if r.severity == \"medium\"]\n",
"    low_severity = [r for r in cleaning_recs if r.severity == \"low\"]\n",
"\n",
"    if high_severity:\n",
"        print(\"\\n🔴 HIGH PRIORITY (must fix before modeling):\")\n",
"        print(\"-\" * 60)\n",
"        for rec in high_severity:\n",
"            print(f\"\\n 📌 {rec.column_name}\")\n",
"            print(f\" Issue: {rec.description}\")\n",
"            print(f\" Strategy: {rec.strategy_label}\")\n",
"            print(f\" Impact: {rec.problem_impact}\")\n",
"            if rec.action_steps:\n",
"                print(\" Action Steps:\")\n",
"                for step in rec.action_steps:\n",
"                    print(f\" • {step}\")\n",
"\n",
"    if medium_severity:\n",
"        print(\"\\n🟡 MEDIUM PRIORITY (recommended fixes):\")\n",
"        print(\"-\" * 60)\n",
"        for rec in medium_severity:\n",
"            print(f\"\\n 📌 {rec.column_name}\")\n",
"            print(f\" Issue: {rec.description}\")\n",
"            print(f\" Strategy: {rec.strategy_label}\")\n",
"            print(f\" Impact: {rec.problem_impact}\")\n",
"            if rec.action_steps:\n",
"                print(\" Action Steps:\")\n",
"                for step in rec.action_steps:\n",
"                    print(f\" • {step}\")\n",
"\n",
"    if low_severity:\n",
"        print(\"\\n🟢 LOW PRIORITY (nice to have):\")\n",
"        print(\"-\" * 60)\n",
"        for rec in low_severity:\n",
"            print(f\"\\n 📌 {rec.column_name}\")\n",
"            print(f\" Issue: {rec.description}\")\n",
"            print(f\" Strategy: {rec.strategy_label}\")\n",
"            print(f\" Impact: {rec.problem_impact}\")\n",
"\n",
"    # Persist cleaning recommendations to registry\n",
"    for rec in cleaning_recs:\n",
"        # Check if not already added by previous sections\n",
"        existing_null = [r for r in registry.bronze.null_handling if r.target_column == rec.column_name]\n",
"        existing_outlier = [r for r in registry.bronze.outlier_handling if r.target_column == rec.column_name]\n",
"\n",
"        if rec.issue_type in [\"null_values\", \"missing_values\"] and not existing_null:\n",
"            strategy = \"median\" if \"median\" in rec.strategy.lower() else \"mode\" if \"mode\" in rec.strategy.lower() else \"drop\"\n",
"            registry.add_bronze_null(\n",
"                column=rec.column_name,\n",
"                strategy=strategy,\n",
"                rationale=rec.description,\n",
"                source_notebook=\"03_quality_assessment\"\n",
"            )\n",
"        elif rec.issue_type == \"outliers\" and not existing_outlier:\n",
"            registry.add_bronze_outlier(\n",
"                column=rec.column_name,\n",
"                action=\"winsorize\" if \"winsor\" in rec.strategy.lower() else \"cap\",\n",
"                parameters={\"severity\": rec.severity, \"affected_rows\": rec.affected_rows},\n",
"                rationale=rec.description,\n",
"                source_notebook=\"03_quality_assessment\"\n",
"            )\n",
"\n",
"    # Summary table\n",
"    print(\"\\n\" + \"=\" * 80)\n",
"    print(\"CLEANUP SUMMARY\")\n",
"    print(\"=\" * 80)\n",
"\n",
"    summary_data = []\n",
"    for rec in cleaning_recs:\n",
"        summary_data.append({\n",
"            \"Column\": rec.column_name,\n",
"            \"Issue\": rec.issue_type.replace(\"_\", \" \").title(),\n",
"            \"Severity\": rec.severity.upper(),\n",
"            \"Recommended Action\": rec.strategy_label,\n",
"            \"Affected Rows\": f\"{rec.affected_rows:,}\"\n",
"        })\n",
"\n",
"    summary_df = pd.DataFrame(summary_data)\n",
"    display(summary_df)\n",
"\n",
"    # Total impact\n",
"    total_affected = sum(r.affected_rows for r in cleaning_recs)\n",
"    print(f\"\\nTotal potentially affected: {total_affected:,} cell values\")\n",
"    print(f\"Columns needing attention: {len(cleaning_recs)}\")\n",
"else:\n",
"    print(\"\\n✅ No cleaning recommendations - data quality is excellent!\")"
]
},
{
"cell_type": "markdown",
"id": "b623adb2",
"metadata": {
"papermill": {
"duration": 0.009966,
"end_time": "2026-02-02T13:03:10.844511",
"exception": false,
"start_time": "2026-02-02T13:03:10.834545",
"status": "completed"
},
"tags": []
},
"source": [
"## 3.13 Save Updated Findings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d98d70d7",
"metadata": {
"execution": {
"iopub.execute_input": "2026-02-02T13:03:10.893014Z",
"iopub.status.busy": "2026-02-02T13:03:10.892883Z",
"iopub.status.idle": "2026-02-02T13:03:10.987017Z",
"shell.execute_reply": "2026-02-02T13:03:10.986626Z"
},
"papermill": {
"duration": 0.116089,
"end_time": "2026-02-02T13:03:10.987834",
"exception": false,
"start_time": "2026-02-02T13:03:10.871745",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Save updated findings back to the same file\n",
"findings.save(FINDINGS_PATH)\n",
"print(f\"Updated findings saved to: {FINDINGS_PATH}\")\n",
"\n",
"# Save recommendations registry\n",
"registry.save(RECOMMENDATIONS_PATH)\n",
"print(f\"Recommendations saved to: {RECOMMENDATIONS_PATH}\")\n",
"\n",
"# Summary of recommendations\n",
"all_recs = registry.all_recommendations\n",
"print(\"\\n📋 Recommendations Summary:\")\n",
"print(f\" Bronze layer: {len(registry.get_by_layer('bronze'))} recommendations\")\n",
"if registry.silver:\n",
"    print(f\" Silver layer: {len(registry.get_by_layer('silver'))} recommendations\")\n",
"if registry.gold:\n",
"    print(f\" Gold layer: {len(registry.get_by_layer('gold'))} recommendations\")\n",
"print(f\" Total: {len(all_recs)} recommendations\")\n"
]
},
{
"cell_type": "markdown",
"id": "278956b0",
"metadata": {
"papermill": {
"duration": 0.008829,
"end_time": "2026-02-02T13:03:11.005721",
"exception": false,
"start_time": "2026-02-02T13:03:10.996892",
"status": "completed"
},
"tags": []
},
"source": [
"---\n",
"\n",
"## Summary: What We Learned\n",
"\n",
"In this notebook, we performed a comprehensive quality assessment:\n",
"\n",
"1. **Duplicate Analysis** - Identified key-based duplicates, exact duplicates, and value conflicts\n",
"2. **Target Variable** - Analyzed class distribution and imbalance for modeling guidance\n",
"3. **Missing Values** - Analyzed patterns (MCAR/MAR/MNAR) and correlations\n",
"4. **Segment-Aware Outliers** - Detected natural data segments to avoid false-positive outliers\n",
"5. **Global Outliers** - Detected using the IQR method with bounds and percentages\n",
"6. **Date Logic** - Validated temporal sequences and detected placeholders\n",
"7. **Binary Fields** - Verified 0/1 encoding and distributions\n",
"8. **Consistency** - Checked for case variants and spacing issues\n",
"9. **Recommendations** - Generated automated cleaning strategies\n",
"\n",
"## Key Insights\n",
"\n",
"**Duplicate Analysis:**\n",
"- Exact duplicates should be removed before modeling\n",
"- Key duplicates with value conflicts require investigation and a resolution strategy\n",
"- High duplicate percentages may indicate event-level data requiring aggregation\n",
"\n",
"**Segment-Aware Analysis:**\n",
"- If segments were detected, some global \"outliers\" may actually be valid data from different customer segments\n",
"- Enterprise vs retail customers, or new vs established accounts, often have legitimately different value ranges\n",
"- Use segment-specific outlier treatment when recommended to preserve important patterns\n",
"\n",
"## Key Cleanup Actions for This Dataset\n",
"\n",
"Based on the analysis above:\n",
"- **Duplicates**: Review key duplicates and resolve value conflicts\n",
"- **Missing Values**: Low (0.06%) - can drop or impute with mode\n",
"- **Outliers**: Check segment-aware analysis results - some may be false positives\n",
"- **Date Issues**: Check date columns (e.g., `lastorder`) for placeholder values\n",
"- **Binary Fields**: All valid with 0/1 encoding\n",
"\n",
"---\n",
"\n",
"## Next Steps\n",
"\n",
"Continue to **04_relationship_analysis.ipynb** to:\n",
"- Explore correlations between features\n",
"- Analyze feature-target relationships\n",
"- Identify potential feature interactions\n",
"- Detect multicollinearity issues"
]
},
{
"cell_type": "markdown",
"id": "81f5fb41",
"metadata": {
"papermill": {
"duration": 0.008798,
"end_time": "2026-02-02T13:03:11.022834",
"exception": false,
"start_time": "2026-02-02T13:03:11.014036",
"status": "completed"
},
"tags": []
},
"source": [
"> **Save Reminder:** Save this notebook (Ctrl+S / Cmd+S) before running the next one.\n",
"> The next notebook will automatically export this notebook's HTML documentation from the saved file."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
},
"papermill": {
"default_parameters": {},
"duration": 6.749412,
"end_time": "2026-02-02T13:03:13.648240",
"environment_variables": {},
"exception": null,
"input_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/03_quality_assessment.ipynb",
"output_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/03_quality_assessment.ipynb",
"parameters": {},
"start_time": "2026-02-02T13:03:06.898828",
"version": "2.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}