churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
|
@@ -0,0 +1,1430 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"id": "9359aa8a",
|
|
6
|
+
"metadata": {
|
|
7
|
+
"papermill": {
|
|
8
|
+
"duration": 0.003327,
|
|
9
|
+
"end_time": "2026-02-02T13:02:59.136885",
|
|
10
|
+
"exception": false,
|
|
11
|
+
"start_time": "2026-02-02T13:02:59.133558",
|
|
12
|
+
"status": "completed"
|
|
13
|
+
},
|
|
14
|
+
"tags": []
|
|
15
|
+
},
|
|
16
|
+
"source": [
|
|
17
|
+
"# Chapter 2: Column Deep Dive\n",
|
|
18
|
+
"\n",
|
|
19
|
+
"**Purpose:** Analyze each column in detail with distribution analysis, value validation, and transformation recommendations.\n",
|
|
20
|
+
"\n",
|
|
21
|
+
"**What you'll learn:**\n",
|
|
22
|
+
"- How to validate value ranges for different column types\n",
|
|
23
|
+
"- How to interpret distribution shapes (skewness, kurtosis)\n",
|
|
24
|
+
"- When and why to apply transformations (log, sqrt, capping)\n",
|
|
25
|
+
"- How to detect zero-inflation and handle it\n",
|
|
26
|
+
"\n",
|
|
27
|
+
"**Outputs:**\n",
|
|
28
|
+
"- Value range validation results\n",
|
|
29
|
+
"- Per-column distribution visualizations with statistics\n",
|
|
30
|
+
"- Skewness/kurtosis analysis with transformation recommendations\n",
|
|
31
|
+
"- Zero-inflation detection\n",
|
|
32
|
+
"- Type confirmation/override capability\n",
|
|
33
|
+
"- Updated exploration findings"
|
|
34
|
+
]
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
"cell_type": "markdown",
|
|
38
|
+
"id": "2caa955f",
|
|
39
|
+
"metadata": {
|
|
40
|
+
"papermill": {
|
|
41
|
+
"duration": 0.002958,
|
|
42
|
+
"end_time": "2026-02-02T13:02:59.142413",
|
|
43
|
+
"exception": false,
|
|
44
|
+
"start_time": "2026-02-02T13:02:59.139455",
|
|
45
|
+
"status": "completed"
|
|
46
|
+
},
|
|
47
|
+
"tags": []
|
|
48
|
+
},
|
|
49
|
+
"source": [
|
|
50
|
+
"## 2.1 Load Previous Findings"
|
|
51
|
+
]
|
|
52
|
+
},
|
|
53
|
+
{
|
|
54
|
+
"cell_type": "code",
|
|
55
|
+
"execution_count": null,
|
|
56
|
+
"id": "9dde1f33",
|
|
57
|
+
"metadata": {
|
|
58
|
+
"execution": {
|
|
59
|
+
"iopub.execute_input": "2026-02-02T13:02:59.148522Z",
|
|
60
|
+
"iopub.status.busy": "2026-02-02T13:02:59.148372Z",
|
|
61
|
+
"iopub.status.idle": "2026-02-02T13:03:00.961868Z",
|
|
62
|
+
"shell.execute_reply": "2026-02-02T13:03:00.961427Z"
|
|
63
|
+
},
|
|
64
|
+
"papermill": {
|
|
65
|
+
"duration": 1.817589,
|
|
66
|
+
"end_time": "2026-02-02T13:03:00.962754",
|
|
67
|
+
"exception": false,
|
|
68
|
+
"start_time": "2026-02-02T13:02:59.145165",
|
|
69
|
+
"status": "completed"
|
|
70
|
+
},
|
|
71
|
+
"tags": []
|
|
72
|
+
},
|
|
73
|
+
"outputs": [],
|
|
74
|
+
"source": [
|
|
75
|
+
"from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
|
|
76
|
+
"track_and_export_previous(\"02_column_deep_dive.ipynb\")\n",
|
|
77
|
+
"\n",
|
|
78
|
+
"from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationRegistry\n",
|
|
79
|
+
"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table, console\n",
|
|
80
|
+
"from customer_retention.core.config.column_config import ColumnType\n",
|
|
81
|
+
"from customer_retention.stages.profiling import (\n",
|
|
82
|
+
" DistributionAnalyzer, TransformationType,\n",
|
|
83
|
+
" TemporalAnalyzer, TemporalGranularity,\n",
|
|
84
|
+
" CategoricalDistributionAnalyzer, EncodingType\n",
|
|
85
|
+
")\n",
|
|
86
|
+
"from customer_retention.stages.validation import DataValidator, RuleGenerator\n",
|
|
87
|
+
"import pandas as pd\n",
|
|
88
|
+
"import numpy as np\n",
|
|
89
|
+
"from scipy import stats\n",
|
|
90
|
+
"import plotly.graph_objects as go\n",
|
|
91
|
+
"import plotly.express as px\n",
|
|
92
|
+
"from plotly.subplots import make_subplots\n",
|
|
93
|
+
"from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure\n"
|
|
94
|
+
]
|
|
95
|
+
},
|
|
96
|
+
{
|
|
97
|
+
"cell_type": "code",
|
|
98
|
+
"execution_count": null,
|
|
99
|
+
"id": "18c54d15",
|
|
100
|
+
"metadata": {
|
|
101
|
+
"execution": {
|
|
102
|
+
"iopub.execute_input": "2026-02-02T13:03:00.968282Z",
|
|
103
|
+
"iopub.status.busy": "2026-02-02T13:03:00.968075Z",
|
|
104
|
+
"iopub.status.idle": "2026-02-02T13:03:01.104881Z",
|
|
105
|
+
"shell.execute_reply": "2026-02-02T13:03:01.104422Z"
|
|
106
|
+
},
|
|
107
|
+
"papermill": {
|
|
108
|
+
"duration": 0.140223,
|
|
109
|
+
"end_time": "2026-02-02T13:03:01.105512",
|
|
110
|
+
"exception": false,
|
|
111
|
+
"start_time": "2026-02-02T13:03:00.965289",
|
|
112
|
+
"status": "completed"
|
|
113
|
+
},
|
|
114
|
+
"tags": []
|
|
115
|
+
},
|
|
116
|
+
"outputs": [],
|
|
117
|
+
"source": [
|
|
118
|
+
"# === CONFIGURATION ===\n",
|
|
119
|
+
"# Option 1: Set the exact path from notebook 01 output\n",
|
|
120
|
+
"# FINDINGS_PATH = \"../experiments/findings/customer_retention_retail_abc123_findings.yaml\"\n",
|
|
121
|
+
"\n",
|
|
122
|
+
"# Option 2: Auto-discover findings file (prefers aggregated over event-level)\n",
|
|
123
|
+
"from pathlib import Path\n",
|
|
124
|
+
"import os\n",
|
|
125
|
+
"\n",
|
|
126
|
+
"# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
|
|
127
|
+
"\n",
|
|
128
|
+
"# Find all findings files\n",
|
|
129
|
+
"findings_files = [f for f in FINDINGS_DIR.glob(\"*_findings.yaml\") if \"multi_dataset\" not in f.name]\n",
|
|
130
|
+
"if not findings_files:\n",
|
|
131
|
+
" raise FileNotFoundError(f\"No findings files found in {FINDINGS_DIR}. Run notebook 01 first.\")\n",
|
|
132
|
+
"\n",
|
|
133
|
+
"# Prefer aggregated findings (from 01d) over event-level findings\n",
|
|
134
|
+
"# This ensures notebooks 02-10 work with entity-level data\n",
|
|
135
|
+
"# Pattern: '_aggregated' anywhere in the filename indicates aggregated data\n",
|
|
136
|
+
"aggregated_files = [f for f in findings_files if \"_aggregated\" in f.name]\n",
|
|
137
|
+
"non_aggregated_files = [f for f in findings_files if \"_aggregated\" not in f.name]\n",
|
|
138
|
+
"\n",
|
|
139
|
+
"if aggregated_files:\n",
|
|
140
|
+
" # Use most recent aggregated file\n",
|
|
141
|
+
" aggregated_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)\n",
|
|
142
|
+
" FINDINGS_PATH = str(aggregated_files[0])\n",
|
|
143
|
+
" print(f\"Found {len(aggregated_files)} aggregated findings file(s)\")\n",
|
|
144
|
+
" print(f\"Using: {FINDINGS_PATH}\")\n",
|
|
145
|
+
" if non_aggregated_files:\n",
|
|
146
|
+
" print(f\" (Skipping {len(non_aggregated_files)} event-level findings)\")\n",
|
|
147
|
+
"else:\n",
|
|
148
|
+
" # Fall back to most recent non-aggregated file\n",
|
|
149
|
+
" non_aggregated_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)\n",
|
|
150
|
+
" FINDINGS_PATH = str(non_aggregated_files[0])\n",
|
|
151
|
+
" print(f\"Found {len(non_aggregated_files)} findings file(s)\")\n",
|
|
152
|
+
" print(f\"Using: {FINDINGS_PATH}\")\n",
|
|
153
|
+
"\n",
|
|
154
|
+
"findings = ExplorationFindings.load(FINDINGS_PATH)\n",
|
|
155
|
+
"print(f\"\\nLoaded findings for {findings.column_count} columns from {findings.source_path}\")\n",
|
|
156
|
+
"\n",
|
|
157
|
+
"# Warn if this is event-level data (should run 01d first)\n",
|
|
158
|
+
"if findings.is_time_series and \"_aggregated\" not in FINDINGS_PATH:\n",
|
|
159
|
+
" ts_meta = findings.time_series_metadata\n",
|
|
160
|
+
" print(f\"\\n⚠️ WARNING: This appears to be EVENT-LEVEL data\")\n",
|
|
161
|
+
" print(f\" Entity: {ts_meta.entity_column}, Time: {ts_meta.time_column}\")\n",
|
|
162
|
+
" print(f\" Recommendation: Run 01d_event_aggregation.ipynb first to create entity-level data\")"
|
|
163
|
+
]
|
|
164
|
+
},
|
|
165
|
+
{
|
|
166
|
+
"cell_type": "markdown",
|
|
167
|
+
"id": "03167807",
|
|
168
|
+
"metadata": {
|
|
169
|
+
"papermill": {
|
|
170
|
+
"duration": 0.002163,
|
|
171
|
+
"end_time": "2026-02-02T13:03:01.110263",
|
|
172
|
+
"exception": false,
|
|
173
|
+
"start_time": "2026-02-02T13:03:01.108100",
|
|
174
|
+
"status": "completed"
|
|
175
|
+
},
|
|
176
|
+
"tags": []
|
|
177
|
+
},
|
|
178
|
+
"source": [
|
|
179
|
+
"## 2.2 Load Source Data"
|
|
180
|
+
]
|
|
181
|
+
},
|
|
182
|
+
{
|
|
183
|
+
"cell_type": "code",
|
|
184
|
+
"execution_count": null,
|
|
185
|
+
"id": "b55e4eac",
|
|
186
|
+
"metadata": {
|
|
187
|
+
"execution": {
|
|
188
|
+
"iopub.execute_input": "2026-02-02T13:03:01.115492Z",
|
|
189
|
+
"iopub.status.busy": "2026-02-02T13:03:01.115384Z",
|
|
190
|
+
"iopub.status.idle": "2026-02-02T13:03:01.212413Z",
|
|
191
|
+
"shell.execute_reply": "2026-02-02T13:03:01.211972Z"
|
|
192
|
+
},
|
|
193
|
+
"papermill": {
|
|
194
|
+
"duration": 0.100207,
|
|
195
|
+
"end_time": "2026-02-02T13:03:01.212881",
|
|
196
|
+
"exception": false,
|
|
197
|
+
"start_time": "2026-02-02T13:03:01.112674",
|
|
198
|
+
"status": "completed"
|
|
199
|
+
},
|
|
200
|
+
"tags": []
|
|
201
|
+
},
|
|
202
|
+
"outputs": [],
|
|
203
|
+
"source": [
|
|
204
|
+
"# Load data - handle aggregated parquet files directly\n",
|
|
205
|
+
"from customer_retention.stages.temporal import load_data_with_snapshot_preference, TEMPORAL_METADATA_COLS\n",
|
|
206
|
+
"\n",
|
|
207
|
+
"# For aggregated data, load directly from the parquet source\n",
|
|
208
|
+
"if \"_aggregated\" in FINDINGS_PATH and findings.source_path.endswith('.parquet'):\n",
|
|
209
|
+
" source_path = Path(findings.source_path)\n",
|
|
210
|
+
" # Handle relative path from notebook directory\n",
|
|
211
|
+
" if not source_path.is_absolute():\n",
|
|
212
|
+
" source_path = Path(\"..\") / source_path.relative_to(\"..\") if str(source_path).startswith(\"..\") else FINDINGS_DIR / source_path.name\n",
|
|
213
|
+
" df = pd.read_parquet(source_path)\n",
|
|
214
|
+
" data_source = f\"aggregated:{source_path.name}\"\n",
|
|
215
|
+
"else:\n",
|
|
216
|
+
" # Standard loading for event-level or entity-level data\n",
|
|
217
|
+
" df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
|
|
218
|
+
"\n",
|
|
219
|
+
"print(f\"Loaded data from: {data_source}\")\n",
|
|
220
|
+
"print(f\"Shape: {df.shape}\")\n",
|
|
221
|
+
"\n",
|
|
222
|
+
"charts = ChartBuilder()\n",
|
|
223
|
+
"\n",
|
|
224
|
+
"# Initialize recommendation registry for this exploration\n",
|
|
225
|
+
"registry = RecommendationRegistry()\n",
|
|
226
|
+
"registry.init_bronze(findings.source_path)\n",
|
|
227
|
+
"\n",
|
|
228
|
+
"# Find target column for Gold layer initialization\n",
|
|
229
|
+
"target_col = next((name for name, col in findings.columns.items() if col.inferred_type == ColumnType.TARGET), None)\n",
|
|
230
|
+
"if target_col:\n",
|
|
231
|
+
" registry.init_gold(target_col)\n",
|
|
232
|
+
"\n",
|
|
233
|
+
"# Find entity column for Silver layer initialization\n",
|
|
234
|
+
"entity_col = next((name for name, col in findings.columns.items() if col.inferred_type == ColumnType.IDENTIFIER), None)\n",
|
|
235
|
+
"if entity_col:\n",
|
|
236
|
+
" registry.init_silver(entity_col)\n",
|
|
237
|
+
"\n",
|
|
238
|
+
"print(f\"Initialized recommendation registry (Bronze: {findings.source_path})\")"
|
|
239
|
+
]
|
|
240
|
+
},
|
|
241
|
+
{
|
|
242
|
+
"cell_type": "markdown",
|
|
243
|
+
"id": "fd863df6",
|
|
244
|
+
"metadata": {
|
|
245
|
+
"papermill": {
|
|
246
|
+
"duration": 0.002033,
|
|
247
|
+
"end_time": "2026-02-02T13:03:01.217382",
|
|
248
|
+
"exception": false,
|
|
249
|
+
"start_time": "2026-02-02T13:03:01.215349",
|
|
250
|
+
"status": "completed"
|
|
251
|
+
},
|
|
252
|
+
"tags": []
|
|
253
|
+
},
|
|
254
|
+
"source": [
|
|
255
|
+
"## 2.3 Value Range Validation\n",
|
|
256
|
+
"\n",
|
|
257
|
+
"**📖 Interpretation Guide:**\n",
|
|
258
|
+
"- **Percentage fields** (rates): Should be 0-100 or 0-1 depending on format\n",
|
|
259
|
+
"- **Binary fields**: Should only contain 0 and 1\n",
|
|
260
|
+
"- **Count fields**: Should be non-negative integers\n",
|
|
261
|
+
"- **Amount fields**: Should be non-negative (unless refunds are possible)\n",
|
|
262
|
+
"\n",
|
|
263
|
+
"**What to Watch For:**\n",
|
|
264
|
+
"- Rates > 100% suggest measurement or data entry errors\n",
|
|
265
|
+
"- Negative values in fields that should be positive\n",
|
|
266
|
+
"- Binary fields with values other than 0/1\n",
|
|
267
|
+
"\n",
|
|
268
|
+
"**Actions:**\n",
|
|
269
|
+
"- Cap rates at the upper bound (100, or 1 for 0-1 formats) if they exceed it, or investigate the cause\n",
|
|
270
|
+
"- Flag records with impossible negative values\n",
|
|
271
|
+
"- Convert binary fields to proper 0/1 encoding"
|
|
272
|
+
]
|
|
273
|
+
},
|
|
274
|
+
{
|
|
275
|
+
"cell_type": "code",
|
|
276
|
+
"execution_count": null,
|
|
277
|
+
"id": "934c4565",
|
|
278
|
+
"metadata": {
|
|
279
|
+
"execution": {
|
|
280
|
+
"iopub.execute_input": "2026-02-02T13:03:01.221952Z",
|
|
281
|
+
"iopub.status.busy": "2026-02-02T13:03:01.221848Z",
|
|
282
|
+
"iopub.status.idle": "2026-02-02T13:03:01.230109Z",
|
|
283
|
+
"shell.execute_reply": "2026-02-02T13:03:01.229657Z"
|
|
284
|
+
},
|
|
285
|
+
"papermill": {
|
|
286
|
+
"duration": 0.011284,
|
|
287
|
+
"end_time": "2026-02-02T13:03:01.230585",
|
|
288
|
+
"exception": false,
|
|
289
|
+
"start_time": "2026-02-02T13:03:01.219301",
|
|
290
|
+
"status": "completed"
|
|
291
|
+
},
|
|
292
|
+
"tags": []
|
|
293
|
+
},
|
|
294
|
+
"outputs": [],
|
|
295
|
+
"source": [
|
|
296
|
+
"validator = DataValidator()\n",
|
|
297
|
+
"range_rules = RuleGenerator.from_findings(findings)\n",
|
|
298
|
+
"\n",
|
|
299
|
+
"console.start_section()\n",
|
|
300
|
+
"console.header(\"Value Range Validation\")\n",
|
|
301
|
+
"\n",
|
|
302
|
+
"if range_rules:\n",
|
|
303
|
+
" range_results = validator.validate_value_ranges(df, range_rules)\n",
|
|
304
|
+
" \n",
|
|
305
|
+
" issues_found = []\n",
|
|
306
|
+
" for r in range_results:\n",
|
|
307
|
+
" detail = f\"{r.invalid_values} invalid\" if r.invalid_values > 0 else None\n",
|
|
308
|
+
" console.check(f\"{r.column_name} ({r.rule_type})\", r.invalid_values == 0, detail)\n",
|
|
309
|
+
" if r.invalid_values > 0:\n",
|
|
310
|
+
" issues_found.append(r)\n",
|
|
311
|
+
" \n",
|
|
312
|
+
" all_invalid = sum(r.invalid_values for r in range_results)\n",
|
|
313
|
+
" if all_invalid == 0:\n",
|
|
314
|
+
" console.success(\"All value ranges valid\")\n",
|
|
315
|
+
" else:\n",
|
|
316
|
+
" console.error(f\"Found {all_invalid:,} values outside expected ranges\")\n",
|
|
317
|
+
" \n",
|
|
318
|
+
" console.info(\"Examples of invalid values:\")\n",
|
|
319
|
+
" for r in issues_found[:3]:\n",
|
|
320
|
+
" col = r.column_name\n",
|
|
321
|
+
" if col in df.columns:\n",
|
|
322
|
+
" if r.rule_type == 'binary':\n",
|
|
323
|
+
" invalid_mask = ~df[col].isin([0, 1, np.nan])\n",
|
|
324
|
+
" condition = \"value not in [0, 1]\"\n",
|
|
325
|
+
" elif r.rule_type == 'non_negative':\n",
|
|
326
|
+
" invalid_mask = df[col] < 0\n",
|
|
327
|
+
" condition = \"value < 0\"\n",
|
|
328
|
+
" elif r.rule_type == 'percentage':\n",
|
|
329
|
+
" invalid_mask = (df[col] < 0) | (df[col] > 100)\n",
|
|
330
|
+
" condition = \"value < 0 or value > 100\"\n",
|
|
331
|
+
" elif r.rule_type == 'rate':\n",
|
|
332
|
+
" invalid_mask = (df[col] < 0) | (df[col] > 1)\n",
|
|
333
|
+
" condition = \"value < 0 or value > 1\"\n",
|
|
334
|
+
" else:\n",
|
|
335
|
+
" continue\n",
|
|
336
|
+
" \n",
|
|
337
|
+
" invalid_values = df.loc[invalid_mask, col].dropna()\n",
|
|
338
|
+
" if len(invalid_values) > 0:\n",
|
|
339
|
+
" examples = invalid_values.head(5).tolist()\n",
|
|
340
|
+
" console.metric(f\" {col}\", f\"{examples}\")\n",
|
|
341
|
+
" \n",
|
|
342
|
+
" # Add filtering recommendation\n",
|
|
343
|
+
" registry.add_bronze_filtering(\n",
|
|
344
|
+
" column=col, condition=condition, action=\"cap\",\n",
|
|
345
|
+
" rationale=f\"{r.invalid_values} values violate {r.rule_type} constraint\",\n",
|
|
346
|
+
" source_notebook=\"02_column_deep_dive\"\n",
|
|
347
|
+
" )\n",
|
|
348
|
+
" \n",
|
|
349
|
+
" console.info(\"Rules auto-generated from detected column types\")\n",
|
|
350
|
+
"else:\n",
|
|
351
|
+
" range_results = []\n",
|
|
352
|
+
" console.info(\"No validation rules generated - no binary/numeric columns detected\")\n",
|
|
353
|
+
"\n",
|
|
354
|
+
"console.end_section()"
|
|
355
|
+
]
|
|
356
|
+
},
|
|
357
|
+
{
|
|
358
|
+
"cell_type": "markdown",
|
|
359
|
+
"id": "d39a7123",
|
|
360
|
+
"metadata": {
|
|
361
|
+
"papermill": {
|
|
362
|
+
"duration": 0.002152,
|
|
363
|
+
"end_time": "2026-02-02T13:03:01.235280",
|
|
364
|
+
"exception": false,
|
|
365
|
+
"start_time": "2026-02-02T13:03:01.233128",
|
|
366
|
+
"status": "completed"
|
|
367
|
+
},
|
|
368
|
+
"tags": []
|
|
369
|
+
},
|
|
370
|
+
"source": [
|
|
371
|
+
"## 2.4 Numeric Columns Analysis\n",
|
|
372
|
+
"\n",
|
|
373
|
+
"**📖 How to Interpret These Charts:**\n",
|
|
374
|
+
"- **Red dashed line** = Mean (sensitive to outliers)\n",
|
|
375
|
+
"- **Green solid line** = Median (robust to outliers)\n",
|
|
376
|
+
"- **Large gap between mean and median** = Skewed distribution\n",
|
|
377
|
+
"- **Long right tail** = Positive skew (common in count/amount data)\n",
|
|
378
|
+
"\n",
|
|
379
|
+
"**📖 Understanding Distribution Metrics**\n",
|
|
380
|
+
"\n",
|
|
381
|
+
"| Metric | Interpretation | Action |\n",
|
|
382
|
+
"|--------|---------------|--------|\n",
|
|
383
|
+
"| **Skewness** | Measures asymmetry | \\|skew\\| > 1: Consider log transform |\n",
|
|
384
|
+
"| **Kurtosis** | Measures tail heaviness | kurt > 10: Cap outliers before transform |\n",
|
|
385
|
+
"| **Zero %** | Percentage of zeros | > 40%: Use zero-inflation handling |\n",
|
|
386
|
+
"\n",
|
|
387
|
+
"**📖 Transformation Decision Tree:**\n",
|
|
388
|
+
"1. If zeros > 40% → Create binary indicator + log(non-zeros)\n",
|
|
389
|
+
"2. If \\|skewness\\| > 1 AND kurtosis > 10 → Cap then log\n",
|
|
390
|
+
"3. If \\|skewness\\| > 1 → Log transform\n",
|
|
391
|
+
"4. If kurtosis > 10 → Cap outliers only\n",
|
|
392
|
+
"5. Otherwise → Standard scaling is sufficient"
|
|
393
|
+
]
|
|
394
|
+
},
|
|
395
|
+
{
|
|
396
|
+
"cell_type": "code",
|
|
397
|
+
"execution_count": null,
|
|
398
|
+
"id": "13043429",
|
|
399
|
+
"metadata": {
|
|
400
|
+
"execution": {
|
|
401
|
+
"iopub.execute_input": "2026-02-02T13:03:01.240371Z",
|
|
402
|
+
"iopub.status.busy": "2026-02-02T13:03:01.240257Z",
|
|
403
|
+
"iopub.status.idle": "2026-02-02T13:03:02.330839Z",
|
|
404
|
+
"shell.execute_reply": "2026-02-02T13:03:02.330400Z"
|
|
405
|
+
},
|
|
406
|
+
"papermill": {
|
|
407
|
+
"duration": 1.093984,
|
|
408
|
+
"end_time": "2026-02-02T13:03:02.331518",
|
|
409
|
+
"exception": false,
|
|
410
|
+
"start_time": "2026-02-02T13:03:01.237534",
|
|
411
|
+
"status": "completed"
|
|
412
|
+
},
|
|
413
|
+
"tags": []
|
|
414
|
+
},
|
|
415
|
+
"outputs": [],
|
|
416
|
+
"source": [
|
|
417
|
+
"# Use framework's DistributionAnalyzer for comprehensive analysis\n",
|
|
418
|
+
"analyzer = DistributionAnalyzer()\n",
|
|
419
|
+
"\n",
|
|
420
|
+
"numeric_cols = [\n",
|
|
421
|
+
" name for name, col in findings.columns.items()\n",
|
|
422
|
+
" if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
|
|
423
|
+
" and name not in TEMPORAL_METADATA_COLS\n",
|
|
424
|
+
"]\n",
|
|
425
|
+
"\n",
|
|
426
|
+
"# Analyze all numeric columns using the framework\n",
|
|
427
|
+
"analyses = analyzer.analyze_dataframe(df, numeric_cols)\n",
|
|
428
|
+
"recommendations = {col: analyzer.recommend_transformation(analysis) \n",
|
|
429
|
+
" for col, analysis in analyses.items()}\n",
|
|
430
|
+
"\n",
|
|
431
|
+
"for col_name in numeric_cols:\n",
|
|
432
|
+
" col_info = findings.columns[col_name]\n",
|
|
433
|
+
" analysis = analyses.get(col_name)\n",
|
|
434
|
+
" rec = recommendations.get(col_name)\n",
|
|
435
|
+
" \n",
|
|
436
|
+
" print(f\"\\n{'='*70}\")\n",
|
|
437
|
+
" print(f\"Column: {col_name}\")\n",
|
|
438
|
+
" print(f\"Type: {col_info.inferred_type.value} (Confidence: {col_info.confidence:.0%})\")\n",
|
|
439
|
+
" print(f\"-\" * 70)\n",
|
|
440
|
+
" \n",
|
|
441
|
+
" if analysis:\n",
|
|
442
|
+
" print(f\"📊 Distribution Statistics:\")\n",
|
|
443
|
+
" print(f\" Mean: {analysis.mean:.3f} | Median: {analysis.median:.3f} | Std: {analysis.std:.3f}\")\n",
|
|
444
|
+
" print(f\" Range: [{analysis.min_value:.3f}, {analysis.max_value:.3f}]\")\n",
|
|
445
|
+
" print(f\" Percentiles: 1%={analysis.percentiles['p1']:.3f}, 25%={analysis.q1:.3f}, 75%={analysis.q3:.3f}, 99%={analysis.percentiles['p99']:.3f}\")\n",
|
|
446
|
+
" print(f\"\\n📈 Shape Analysis:\")\n",
|
|
447
|
+
" skew_label = '(Right-skewed)' if analysis.skewness > 0.5 else '(Left-skewed)' if analysis.skewness < -0.5 else '(Symmetric)'\n",
|
|
448
|
+
" print(f\" Skewness: {analysis.skewness:.2f} {skew_label}\")\n",
|
|
449
|
+
" kurt_label = '(Heavy tails/outliers)' if analysis.kurtosis > 3 else '(Light tails)'\n",
|
|
450
|
+
" print(f\" Kurtosis: {analysis.kurtosis:.2f} {kurt_label}\")\n",
|
|
451
|
+
" print(f\" Zeros: {analysis.zero_count:,} ({analysis.zero_percentage:.1f}%)\")\n",
|
|
452
|
+
" print(f\" Outliers (IQR): {analysis.outlier_count_iqr:,} ({analysis.outlier_percentage:.1f}%)\")\n",
|
|
453
|
+
" \n",
|
|
454
|
+
" if rec:\n",
|
|
455
|
+
" print(f\"\\n🔧 Recommended Transformation: {rec.recommended_transform.value}\")\n",
|
|
456
|
+
" print(f\" Reason: {rec.reason}\")\n",
|
|
457
|
+
" print(f\" Priority: {rec.priority}\")\n",
|
|
458
|
+
" if rec.warnings:\n",
|
|
459
|
+
" for warn in rec.warnings:\n",
|
|
460
|
+
" print(f\" ⚠️ {warn}\")\n",
|
|
461
|
+
" \n",
|
|
462
|
+
" # Create enhanced histogram with Plotly\n",
|
|
463
|
+
" data = df[col_name].dropna()\n",
|
|
464
|
+
" fig = go.Figure()\n",
|
|
465
|
+
" \n",
|
|
466
|
+
" fig.add_trace(go.Histogram(x=data, nbinsx=50, name='Distribution',\n",
|
|
467
|
+
" marker_color='steelblue', opacity=0.7))\n",
|
|
468
|
+
" \n",
|
|
469
|
+
" # Calculate mean and median\n",
|
|
470
|
+
" mean_val = data.mean()\n",
|
|
471
|
+
" median_val = data.median()\n",
|
|
472
|
+
" \n",
|
|
473
|
+
" # Position labels on opposite sides (left/right) to avoid overlap\n",
|
|
474
|
+
" # The larger value gets right-justified, smaller gets left-justified\n",
|
|
475
|
+
" mean_position = \"top right\" if mean_val >= median_val else \"top left\"\n",
|
|
476
|
+
" median_position = \"top left\" if mean_val >= median_val else \"top right\"\n",
|
|
477
|
+
" \n",
|
|
478
|
+
" # Add mean line\n",
|
|
479
|
+
" fig.add_vline(\n",
|
|
480
|
+
" x=mean_val, \n",
|
|
481
|
+
" line_dash=\"dash\", \n",
|
|
482
|
+
" line_color=\"red\",\n",
|
|
483
|
+
" annotation_text=f\"Mean: {mean_val:.2f}\",\n",
|
|
484
|
+
" annotation_position=mean_position,\n",
|
|
485
|
+
" annotation_font_color=\"red\",\n",
|
|
486
|
+
" annotation_bgcolor=\"rgba(255,255,255,0.8)\"\n",
|
|
487
|
+
" )\n",
|
|
488
|
+
" \n",
|
|
489
|
+
" # Add median line\n",
|
|
490
|
+
" fig.add_vline(\n",
|
|
491
|
+
" x=median_val, \n",
|
|
492
|
+
" line_dash=\"solid\", \n",
|
|
493
|
+
" line_color=\"green\",\n",
|
|
494
|
+
" annotation_text=f\"Median: {median_val:.2f}\",\n",
|
|
495
|
+
" annotation_position=median_position,\n",
|
|
496
|
+
" annotation_font_color=\"green\",\n",
|
|
497
|
+
" annotation_bgcolor=\"rgba(255,255,255,0.8)\"\n",
|
|
498
|
+
" )\n",
|
|
499
|
+
" \n",
|
|
500
|
+
" # Add 99th percentile marker if there are outliers\n",
|
|
501
|
+
" if analysis and analysis.outlier_percentage > 5:\n",
|
|
502
|
+
" fig.add_vline(x=analysis.percentiles['p99'], line_dash=\"dot\", line_color=\"orange\",\n",
|
|
503
|
+
" annotation_text=f\"99th: {analysis.percentiles['p99']:.2f}\",\n",
|
|
504
|
+
" annotation_position=\"top right\",\n",
|
|
505
|
+
" annotation_font_color=\"orange\",\n",
|
|
506
|
+
" annotation_bgcolor=\"rgba(255,255,255,0.8)\")\n",
|
|
507
|
+
" \n",
|
|
508
|
+
" transform_label = rec.recommended_transform.value if rec else \"none\"\n",
|
|
509
|
+
" fig.update_layout(\n",
|
|
510
|
+
" title=f\"Distribution: {col_name}<br><sub>Skew: {analysis.skewness:.2f} | Kurt: {analysis.kurtosis:.2f} | Strategy: {transform_label}</sub>\",\n",
|
|
511
|
+
" xaxis_title=col_name,\n",
|
|
512
|
+
" yaxis_title=\"Count\",\n",
|
|
513
|
+
" template='plotly_white',\n",
|
|
514
|
+
" height=400\n",
|
|
515
|
+
" )\n",
|
|
516
|
+
" display_figure(fig)"
|
|
517
|
+
]
|
|
518
|
+
},
|
|
519
|
+
{
|
|
520
|
+
"cell_type": "code",
|
|
521
|
+
"execution_count": null,
|
|
522
|
+
"id": "674ea32d",
|
|
523
|
+
"metadata": {
|
|
524
|
+
"execution": {
|
|
525
|
+
"iopub.execute_input": "2026-02-02T13:03:02.532435Z",
|
|
526
|
+
"iopub.status.busy": "2026-02-02T13:03:02.532307Z",
|
|
527
|
+
"iopub.status.idle": "2026-02-02T13:03:02.620535Z",
|
|
528
|
+
"shell.execute_reply": "2026-02-02T13:03:02.620110Z"
|
|
529
|
+
},
|
|
530
|
+
"papermill": {
|
|
531
|
+
"duration": 0.189181,
|
|
532
|
+
"end_time": "2026-02-02T13:03:02.621233",
|
|
533
|
+
"exception": false,
|
|
534
|
+
"start_time": "2026-02-02T13:03:02.432052",
|
|
535
|
+
"status": "completed"
|
|
536
|
+
},
|
|
537
|
+
"tags": []
|
|
538
|
+
},
|
|
539
|
+
"outputs": [],
|
|
540
|
+
"source": [
|
|
541
|
+
"# Numerical Feature Statistics Table\n",
|
|
542
|
+
"if numeric_cols:\n",
|
|
543
|
+
" stats_data = []\n",
|
|
544
|
+
" for col_name in numeric_cols:\n",
|
|
545
|
+
" series = df[col_name].dropna()\n",
|
|
546
|
+
" if len(series) > 0:\n",
|
|
547
|
+
" stats_data.append({\n",
|
|
548
|
+
" \"feature\": col_name,\n",
|
|
549
|
+
" \"count\": len(series),\n",
|
|
550
|
+
" \"mean\": series.mean(),\n",
|
|
551
|
+
" \"std\": series.std(),\n",
|
|
552
|
+
" \"min\": series.min(),\n",
|
|
553
|
+
" \"25%\": series.quantile(0.25),\n",
|
|
554
|
+
" \"50%\": series.quantile(0.50),\n",
|
|
555
|
+
" \"75%\": series.quantile(0.75),\n",
|
|
556
|
+
" \"95%\": series.quantile(0.95),\n",
|
|
557
|
+
" \"99%\": series.quantile(0.99),\n",
|
|
558
|
+
" \"max\": series.max(),\n",
|
|
559
|
+
" \"skewness\": stats.skew(series),\n",
|
|
560
|
+
" \"kurtosis\": stats.kurtosis(series)\n",
|
|
561
|
+
" })\n",
|
|
562
|
+
" \n",
|
|
563
|
+
" stats_df = pd.DataFrame(stats_data)\n",
|
|
564
|
+
" \n",
|
|
565
|
+
" # Format for display\n",
|
|
566
|
+
" display_stats = stats_df.copy()\n",
|
|
567
|
+
" for col in [\"mean\", \"std\", \"min\", \"25%\", \"50%\", \"75%\", \"95%\", \"99%\", \"max\"]:\n",
|
|
568
|
+
" display_stats[col] = display_stats[col].apply(lambda x: f\"{x:.3f}\")\n",
|
|
569
|
+
" display_stats[\"skewness\"] = display_stats[\"skewness\"].apply(lambda x: f\"{x:.3f}\")\n",
|
|
570
|
+
" display_stats[\"kurtosis\"] = display_stats[\"kurtosis\"].apply(lambda x: f\"{x:.3f}\")\n",
|
|
571
|
+
" \n",
|
|
572
|
+
" print(\"=\" * 80)\n",
|
|
573
|
+
" print(\"NUMERICAL FEATURE STATISTICS\")\n",
|
|
574
|
+
" print(\"=\" * 80)\n",
|
|
575
|
+
" display(display_stats)"
|
|
576
|
+
]
|
|
577
|
+
},
|
|
578
|
+
{
|
|
579
|
+
"cell_type": "markdown",
|
|
580
|
+
"id": "b53f6e24",
|
|
581
|
+
"metadata": {
|
|
582
|
+
"papermill": {
|
|
583
|
+
"duration": 0.099217,
|
|
584
|
+
"end_time": "2026-02-02T13:03:02.857720",
|
|
585
|
+
"exception": false,
|
|
586
|
+
"start_time": "2026-02-02T13:03:02.758503",
|
|
587
|
+
"status": "completed"
|
|
588
|
+
},
|
|
589
|
+
"tags": []
|
|
590
|
+
},
|
|
591
|
+
"source": [
|
|
592
|
+
"## 2.5 Distribution Summary & Transformation Plan\n",
|
|
593
|
+
"\n",
|
|
594
|
+
"This table summarizes all numeric columns with their recommended transformations."
|
|
595
|
+
]
|
|
596
|
+
},
|
|
597
|
+
{
|
|
598
|
+
"cell_type": "code",
|
|
599
|
+
"execution_count": null,
|
|
600
|
+
"id": "af3319f6",
|
|
601
|
+
"metadata": {
|
|
602
|
+
"execution": {
|
|
603
|
+
"iopub.execute_input": "2026-02-02T13:03:03.057083Z",
|
|
604
|
+
"iopub.status.busy": "2026-02-02T13:03:03.056972Z",
|
|
605
|
+
"iopub.status.idle": "2026-02-02T13:03:03.063404Z",
|
|
606
|
+
"shell.execute_reply": "2026-02-02T13:03:03.062883Z"
|
|
607
|
+
},
|
|
608
|
+
"papermill": {
|
|
609
|
+
"duration": 0.108346,
|
|
610
|
+
"end_time": "2026-02-02T13:03:03.064068",
|
|
611
|
+
"exception": false,
|
|
612
|
+
"start_time": "2026-02-02T13:03:02.955722",
|
|
613
|
+
"status": "completed"
|
|
614
|
+
},
|
|
615
|
+
"tags": []
|
|
616
|
+
},
|
|
617
|
+
"outputs": [],
|
|
618
|
+
"source": [
|
|
619
|
+
"# Build transformation summary table\n",
|
|
620
|
+
"summary_data = []\n",
|
|
621
|
+
"for col_name in numeric_cols:\n",
|
|
622
|
+
" analysis = analyses.get(col_name)\n",
|
|
623
|
+
" rec = recommendations.get(col_name)\n",
|
|
624
|
+
" \n",
|
|
625
|
+
" if analysis and rec:\n",
|
|
626
|
+
" summary_data.append({\n",
|
|
627
|
+
" \"Column\": col_name,\n",
|
|
628
|
+
" \"Skewness\": f\"{analysis.skewness:.2f}\",\n",
|
|
629
|
+
" \"Kurtosis\": f\"{analysis.kurtosis:.2f}\",\n",
|
|
630
|
+
" \"Zeros %\": f\"{analysis.zero_percentage:.1f}%\",\n",
|
|
631
|
+
" \"Outliers %\": f\"{analysis.outlier_percentage:.1f}%\",\n",
|
|
632
|
+
" \"Transform\": rec.recommended_transform.value,\n",
|
|
633
|
+
" \"Priority\": rec.priority\n",
|
|
634
|
+
" })\n",
|
|
635
|
+
" \n",
|
|
636
|
+
" # Add Gold transformation recommendation if not \"none\"\n",
|
|
637
|
+
" if rec.recommended_transform != TransformationType.NONE and registry.gold:\n",
|
|
638
|
+
" registry.add_gold_transformation(\n",
|
|
639
|
+
" column=col_name,\n",
|
|
640
|
+
" transform=rec.recommended_transform.value,\n",
|
|
641
|
+
" parameters=rec.parameters,\n",
|
|
642
|
+
" rationale=rec.reason,\n",
|
|
643
|
+
" source_notebook=\"02_column_deep_dive\"\n",
|
|
644
|
+
" )\n",
|
|
645
|
+
"\n",
|
|
646
|
+
"if summary_data:\n",
|
|
647
|
+
" summary_df = pd.DataFrame(summary_data)\n",
|
|
648
|
+
" display_table(summary_df)\n",
|
|
649
|
+
" \n",
|
|
650
|
+
" # Show how many transformation recommendations were added\n",
|
|
651
|
+
" transform_count = sum(1 for r in recommendations.values() if r and r.recommended_transform != TransformationType.NONE)\n",
|
|
652
|
+
" if transform_count > 0 and registry.gold:\n",
|
|
653
|
+
" print(f\"\\n✅ Added {transform_count} transformation recommendations to Gold layer\")\n",
|
|
654
|
+
"else:\n",
|
|
655
|
+
" console.info(\"No numeric columns to summarize\")"
|
|
656
|
+
]
|
|
657
|
+
},
|
|
658
|
+
{
|
|
659
|
+
"cell_type": "markdown",
|
|
660
|
+
"id": "9e1f558b",
|
|
661
|
+
"metadata": {
|
|
662
|
+
"papermill": {
|
|
663
|
+
"duration": 0.098534,
|
|
664
|
+
"end_time": "2026-02-02T13:03:03.299429",
|
|
665
|
+
"exception": false,
|
|
666
|
+
"start_time": "2026-02-02T13:03:03.200895",
|
|
667
|
+
"status": "completed"
|
|
668
|
+
},
|
|
669
|
+
"tags": []
|
|
670
|
+
},
|
|
671
|
+
"source": [
|
|
672
|
+
"## 2.6 Categorical Columns Analysis\n",
|
|
673
|
+
"\n",
|
|
674
|
+
"**📖 Distribution Metrics (Analogues to Numeric Skewness/Kurtosis):**\n",
|
|
675
|
+
"\n",
|
|
676
|
+
"| Metric | Interpretation | Action |\n",
|
|
677
|
+
"|--------|---------------|--------|\n",
|
|
678
|
+
"| **Imbalance Ratio** | Largest / Smallest category count | > 10: Consider grouping rare categories |\n",
|
|
679
|
+
"| **Entropy** | Diversity measure (0 = one category, higher = more uniform) | Low entropy: May need stratified sampling |\n",
|
|
680
|
+
"| **Top-3 Concentration** | % of data in top 3 categories | > 90%: Rare categories may cause issues |\n",
|
|
681
|
+
"| **Rare Category %** | Categories with < 1% of data | High %: Group into \"Other\" category |\n",
|
|
682
|
+
"\n",
|
|
683
|
+
"**📖 Encoding Recommendations:**\n",
|
|
684
|
+
"- **Low cardinality (≤5)** → One-hot encoding\n",
|
|
685
|
+
"- **Medium cardinality (6-20)** → One-hot or Target encoding\n",
|
|
686
|
+
"- **High cardinality (>20)** → Target encoding or Frequency encoding\n",
|
|
687
|
+
"- **Cyclical (days, months)** → Sin/Cos encoding\n",
|
|
688
|
+
"\n",
|
|
689
|
+
"**⚠️ Common Issues:**\n",
|
|
690
|
+
"- Rare categories can cause overfitting with one-hot encoding\n",
|
|
691
|
+
"- High cardinality + one-hot = feature explosion\n",
|
|
692
|
+
"- Imbalanced categories may need special handling in train/test splits"
|
|
693
|
+
]
|
|
694
|
+
},
|
|
695
|
+
{
|
|
696
|
+
"cell_type": "code",
|
|
697
|
+
"execution_count": null,
|
|
698
|
+
"id": "740c5c5e",
|
|
699
|
+
"metadata": {
|
|
700
|
+
"execution": {
|
|
701
|
+
"iopub.execute_input": "2026-02-02T13:03:03.500506Z",
|
|
702
|
+
"iopub.status.busy": "2026-02-02T13:03:03.500396Z",
|
|
703
|
+
"iopub.status.idle": "2026-02-02T13:03:03.529538Z",
|
|
704
|
+
"shell.execute_reply": "2026-02-02T13:03:03.528970Z"
|
|
705
|
+
},
|
|
706
|
+
"papermill": {
|
|
707
|
+
"duration": 0.130148,
|
|
708
|
+
"end_time": "2026-02-02T13:03:03.530136",
|
|
709
|
+
"exception": false,
|
|
710
|
+
"start_time": "2026-02-02T13:03:03.399988",
|
|
711
|
+
"status": "completed"
|
|
712
|
+
},
|
|
713
|
+
"tags": []
|
|
714
|
+
},
|
|
715
|
+
"outputs": [],
|
|
716
|
+
"source": [
|
|
717
|
+
"# Use framework's CategoricalDistributionAnalyzer\n",
|
|
718
|
+
"cat_analyzer = CategoricalDistributionAnalyzer()\n",
|
|
719
|
+
"\n",
|
|
720
|
+
"categorical_cols = [\n",
|
|
721
|
+
" name for name, col in findings.columns.items()\n",
|
|
722
|
+
" if col.inferred_type in [ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL, ColumnType.CATEGORICAL_CYCLICAL]\n",
|
|
723
|
+
" and col.inferred_type != ColumnType.TEXT # TEXT columns processed separately in 02a\n",
|
|
724
|
+
" and name not in TEMPORAL_METADATA_COLS\n",
|
|
725
|
+
"]\n",
|
|
726
|
+
"\n",
|
|
727
|
+
"# Analyze all categorical columns\n",
|
|
728
|
+
"cat_analyses = cat_analyzer.analyze_dataframe(df, categorical_cols)\n",
|
|
729
|
+
"\n",
|
|
730
|
+
"# Get encoding recommendations\n",
|
|
731
|
+
"cyclical_cols = [name for name, col in findings.columns.items() \n",
|
|
732
|
+
" if col.inferred_type == ColumnType.CATEGORICAL_CYCLICAL]\n",
|
|
733
|
+
"cat_recommendations = cat_analyzer.get_all_recommendations(df, categorical_cols, cyclical_columns=cyclical_cols)\n",
|
|
734
|
+
"\n",
|
|
735
|
+
"for col_name in categorical_cols:\n",
|
|
736
|
+
" col_info = findings.columns[col_name]\n",
|
|
737
|
+
" analysis = cat_analyses.get(col_name)\n",
|
|
738
|
+
" rec = next((r for r in cat_recommendations if r.column_name == col_name), None)\n",
|
|
739
|
+
" \n",
|
|
740
|
+
" print(f\"\\n{'='*70}\")\n",
|
|
741
|
+
" print(f\"Column: {col_name}\")\n",
|
|
742
|
+
" print(f\"Type: {col_info.inferred_type.value} (Confidence: {col_info.confidence:.0%})\")\n",
|
|
743
|
+
" print(f\"-\" * 70)\n",
|
|
744
|
+
" \n",
|
|
745
|
+
" if analysis:\n",
|
|
746
|
+
" print(f\"\\n📊 Distribution Metrics:\")\n",
|
|
747
|
+
" print(f\" Categories: {analysis.category_count}\")\n",
|
|
748
|
+
" print(f\" Imbalance Ratio: {analysis.imbalance_ratio:.1f}x (largest/smallest)\")\n",
|
|
749
|
+
" print(f\" Entropy: {analysis.entropy:.2f} ({analysis.normalized_entropy*100:.0f}% of max)\")\n",
|
|
750
|
+
" print(f\" Top-1 Concentration: {analysis.top1_concentration:.1f}%\")\n",
|
|
751
|
+
" print(f\" Top-3 Concentration: {analysis.top3_concentration:.1f}%\")\n",
|
|
752
|
+
" print(f\" Rare Categories (<1%): {analysis.rare_category_count}\")\n",
|
|
753
|
+
" \n",
|
|
754
|
+
" # Interpretation\n",
|
|
755
|
+
" print(f\"\\n📈 Interpretation:\")\n",
|
|
756
|
+
" if analysis.has_low_diversity:\n",
|
|
757
|
+
" print(f\" ⚠️ LOW DIVERSITY: Distribution dominated by few categories\")\n",
|
|
758
|
+
" elif analysis.normalized_entropy > 0.9:\n",
|
|
759
|
+
" print(f\" ✓ HIGH DIVERSITY: Categories are relatively balanced\")\n",
|
|
760
|
+
" else:\n",
|
|
761
|
+
" print(f\" ✓ MODERATE DIVERSITY: Some category dominance but acceptable\")\n",
|
|
762
|
+
" \n",
|
|
763
|
+
" if analysis.imbalance_ratio > 100:\n",
|
|
764
|
+
" print(f\" 🔴 SEVERE IMBALANCE: Rarest category has very few samples\")\n",
|
|
765
|
+
" elif analysis.is_imbalanced:\n",
|
|
766
|
+
" print(f\" 🟡 MODERATE IMBALANCE: Consider grouping rare categories\")\n",
|
|
767
|
+
" \n",
|
|
768
|
+
" # Recommendations\n",
|
|
769
|
+
" if rec:\n",
|
|
770
|
+
" print(f\"\\n🔧 Recommendations:\")\n",
|
|
771
|
+
" print(f\" Encoding: {rec.encoding_type.value}\")\n",
|
|
772
|
+
" print(f\" Reason: {rec.reason}\")\n",
|
|
773
|
+
" print(f\" Priority: {rec.priority}\")\n",
|
|
774
|
+
" \n",
|
|
775
|
+
" if rec.preprocessing_steps:\n",
|
|
776
|
+
" print(f\" Preprocessing:\")\n",
|
|
777
|
+
" for step in rec.preprocessing_steps:\n",
|
|
778
|
+
" print(f\" • {step}\")\n",
|
|
779
|
+
" \n",
|
|
780
|
+
" if rec.warnings:\n",
|
|
781
|
+
" for warn in rec.warnings:\n",
|
|
782
|
+
" print(f\" ⚠️ {warn}\")\n",
|
|
783
|
+
" \n",
|
|
784
|
+
" # Visualization\n",
|
|
785
|
+
" value_counts = df[col_name].value_counts()\n",
|
|
786
|
+
" subtitle = f\"Entropy: {analysis.normalized_entropy*100:.0f}% | Imbalance: {analysis.imbalance_ratio:.1f}x | Rare: {analysis.rare_category_count}\" if analysis else \"\"\n",
|
|
787
|
+
" fig = charts.bar_chart(\n",
|
|
788
|
+
" value_counts.head(10).index.tolist(), \n",
|
|
789
|
+
" value_counts.head(10).values.tolist(),\n",
|
|
790
|
+
" title=f\"Top Categories: {col_name}<br><sub>{subtitle}</sub>\"\n",
|
|
791
|
+
" )\n",
|
|
792
|
+
" display_figure(fig)\n",
|
|
793
|
+
"\n",
|
|
794
|
+
"# Summary table and add recommendations to registry\n",
|
|
795
|
+
"if cat_analyses:\n",
|
|
796
|
+
" print(\"\\n\" + \"=\" * 70)\n",
|
|
797
|
+
" print(\"CATEGORICAL COLUMNS SUMMARY\")\n",
|
|
798
|
+
" print(\"=\" * 70)\n",
|
|
799
|
+
" summary_data = []\n",
|
|
800
|
+
" for col_name, analysis in cat_analyses.items():\n",
|
|
801
|
+
" rec = next((r for r in cat_recommendations if r.column_name == col_name), None)\n",
|
|
802
|
+
" summary_data.append({\n",
|
|
803
|
+
" \"Column\": col_name,\n",
|
|
804
|
+
" \"Categories\": analysis.category_count,\n",
|
|
805
|
+
" \"Imbalance\": f\"{analysis.imbalance_ratio:.1f}x\",\n",
|
|
806
|
+
" \"Entropy\": f\"{analysis.normalized_entropy*100:.0f}%\",\n",
|
|
807
|
+
" \"Top-3 Conc.\": f\"{analysis.top3_concentration:.1f}%\",\n",
|
|
808
|
+
" \"Rare (<1%)\": analysis.rare_category_count,\n",
|
|
809
|
+
" \"Encoding\": rec.encoding_type.value if rec else \"N/A\"\n",
|
|
810
|
+
" })\n",
|
|
811
|
+
" \n",
|
|
812
|
+
" # Add encoding recommendation to Gold layer\n",
|
|
813
|
+
" if rec and registry.gold:\n",
|
|
814
|
+
" registry.add_gold_encoding(\n",
|
|
815
|
+
" column=col_name,\n",
|
|
816
|
+
" method=rec.encoding_type.value,\n",
|
|
817
|
+
" rationale=rec.reason,\n",
|
|
818
|
+
" source_notebook=\"02_column_deep_dive\"\n",
|
|
819
|
+
" )\n",
|
|
820
|
+
" \n",
|
|
821
|
+
" display_table(pd.DataFrame(summary_data))\n",
|
|
822
|
+
" \n",
|
|
823
|
+
" if registry.gold:\n",
|
|
824
|
+
" print(f\"\\n✅ Added {len(cat_recommendations)} encoding recommendations to Gold layer\")"
|
|
825
|
+
]
|
|
826
|
+
},
|
|
827
|
+
{
|
|
828
|
+
"cell_type": "markdown",
|
|
829
|
+
"id": "399d9419",
|
|
830
|
+
"metadata": {
|
|
831
|
+
"papermill": {
|
|
832
|
+
"duration": 0.101332,
|
|
833
|
+
"end_time": "2026-02-02T13:03:03.769208",
|
|
834
|
+
"exception": false,
|
|
835
|
+
"start_time": "2026-02-02T13:03:03.667876",
|
|
836
|
+
"status": "completed"
|
|
837
|
+
},
|
|
838
|
+
"tags": []
|
|
839
|
+
},
|
|
840
|
+
"source": [
|
|
841
|
+
"## 2.7 Datetime Columns Analysis\n",
|
|
842
|
+
"\n",
|
|
843
|
+
"**📖 Unlike numeric transformations, datetime analysis recommends NEW FEATURES to create:**\n",
|
|
844
|
+
"\n",
|
|
845
|
+
"| Recommendation Type | Purpose | Examples |\n",
|
|
846
|
+
"|---------------------|---------|----------|\n",
|
|
847
|
+
"| **Feature Engineering** | Create predictive features from dates | `days_since_signup`, `tenure_years`, `month_sin_cos` |\n",
|
|
848
|
+
"| **Modeling Strategy** | How to structure train/test | Time-based splits when trends detected |\n",
|
|
849
|
+
"| **Data Quality** | Issues to address before modeling | Placeholder dates (1/1/1900) to filter |\n",
|
|
850
|
+
"\n",
|
|
851
|
+
"**📖 Feature Engineering Strategies:**\n",
|
|
852
|
+
"- **Recency**: `days_since_X` - How recent was the event? (useful for predicting behavior)\n",
|
|
853
|
+
"- **Tenure**: `tenure_years` - How long has the customer been active? (maturity/loyalty)\n",
|
|
854
|
+
"- **Duration**: `days_between_A_and_B` - Time between events (e.g., signup to first purchase)\n",
|
|
855
|
+
"- **Cyclical**: `month_sin`, `month_cos` - Preserves the fact that December is adjacent to January\n",
|
|
856
|
+
"- **Categorical**: `is_weekend`, `is_quarter_end` - Behavioral indicators"
|
|
857
|
+
]
|
|
858
|
+
},
|
|
859
|
+
{
|
|
860
|
+
"cell_type": "code",
|
|
861
|
+
"execution_count": null,
|
|
862
|
+
"id": "c1642bb7",
|
|
863
|
+
"metadata": {
|
|
864
|
+
"execution": {
|
|
865
|
+
"iopub.execute_input": "2026-02-02T13:03:03.974921Z",
|
|
866
|
+
"iopub.status.busy": "2026-02-02T13:03:03.974803Z",
|
|
867
|
+
"iopub.status.idle": "2026-02-02T13:03:03.984910Z",
|
|
868
|
+
"shell.execute_reply": "2026-02-02T13:03:03.984448Z"
|
|
869
|
+
},
|
|
870
|
+
"papermill": {
|
|
871
|
+
"duration": 0.113413,
|
|
872
|
+
"end_time": "2026-02-02T13:03:03.985554",
|
|
873
|
+
"exception": false,
|
|
874
|
+
"start_time": "2026-02-02T13:03:03.872141",
|
|
875
|
+
"status": "completed"
|
|
876
|
+
},
|
|
877
|
+
"tags": []
|
|
878
|
+
},
|
|
879
|
+
"outputs": [],
|
|
880
|
+
"source": [
|
|
881
|
+
"from customer_retention.stages.profiling.temporal_analyzer import TemporalRecommendationType\n",
|
|
882
|
+
"\n",
|
|
883
|
+
"datetime_cols = [\n",
|
|
884
|
+
" name for name, col in findings.columns.items()\n",
|
|
885
|
+
" if col.inferred_type == ColumnType.DATETIME\n",
|
|
886
|
+
" and name not in TEMPORAL_METADATA_COLS\n",
|
|
887
|
+
"]\n",
|
|
888
|
+
"\n",
|
|
889
|
+
"temporal_analyzer = TemporalAnalyzer()\n",
|
|
890
|
+
"\n",
|
|
891
|
+
"# Store all datetime recommendations grouped by type\n",
|
|
892
|
+
"feature_engineering_recs = []\n",
|
|
893
|
+
"modeling_strategy_recs = []\n",
|
|
894
|
+
"data_quality_recs = []\n",
|
|
895
|
+
"datetime_summaries = []\n",
|
|
896
|
+
"\n",
|
|
897
|
+
"for col_name in datetime_cols:\n",
|
|
898
|
+
" col_info = findings.columns[col_name]\n",
|
|
899
|
+
" print(f\"\\n{'='*70}\")\n",
|
|
900
|
+
" print(f\"Column: {col_name}\")\n",
|
|
901
|
+
" print(f\"Type: {col_info.inferred_type.value} (Confidence: {col_info.confidence:.0%})\")\n",
|
|
902
|
+
" print(f\"{'='*70}\")\n",
|
|
903
|
+
" \n",
|
|
904
|
+
" date_series = pd.to_datetime(df[col_name], errors='coerce', format='mixed')\n",
|
|
905
|
+
" valid_dates = date_series.dropna()\n",
|
|
906
|
+
" \n",
|
|
907
|
+
" print(f\"\\n📅 Date Range: {valid_dates.min()} to {valid_dates.max()}\")\n",
|
|
908
|
+
" print(f\" Nulls: {date_series.isna().sum():,} ({date_series.isna().mean()*100:.1f}%)\")\n",
|
|
909
|
+
" \n",
|
|
910
|
+
" # Basic temporal analysis\n",
|
|
911
|
+
" analysis = temporal_analyzer.analyze(date_series)\n",
|
|
912
|
+
" print(f\" Auto-detected granularity: {analysis.granularity.value}\")\n",
|
|
913
|
+
" print(f\" Span: {analysis.span_days:,} days ({analysis.span_days/365:.1f} years)\")\n",
|
|
914
|
+
" \n",
|
|
915
|
+
" # Growth analysis\n",
|
|
916
|
+
" growth = temporal_analyzer.calculate_growth_rate(date_series)\n",
|
|
917
|
+
" if growth.get(\"has_data\"):\n",
|
|
918
|
+
" print(f\"\\n📈 Growth Analysis:\")\n",
|
|
919
|
+
" print(f\" Trend: {growth['trend_direction'].upper()}\")\n",
|
|
920
|
+
" print(f\" Overall growth: {growth['overall_growth_pct']:+.1f}%\")\n",
|
|
921
|
+
" print(f\" Avg monthly growth: {growth['avg_monthly_growth']:+.1f}%\")\n",
|
|
922
|
+
" \n",
|
|
923
|
+
" # Seasonality analysis\n",
|
|
924
|
+
" seasonality = temporal_analyzer.analyze_seasonality(date_series)\n",
|
|
925
|
+
" if seasonality.has_seasonality:\n",
|
|
926
|
+
" print(f\"\\n🔄 Seasonality Detected:\")\n",
|
|
927
|
+
" print(f\" Peak months: {', '.join(seasonality.peak_periods[:3])}\")\n",
|
|
928
|
+
" print(f\" Trough months: {', '.join(seasonality.trough_periods[:3])}\")\n",
|
|
929
|
+
" print(f\" Seasonal strength: {seasonality.seasonal_strength:.2f}\")\n",
|
|
930
|
+
" \n",
|
|
931
|
+
" # Get recommendations using framework\n",
|
|
932
|
+
" other_dates = [c for c in datetime_cols if c != col_name]\n",
|
|
933
|
+
" recommendations = temporal_analyzer.recommend_features(date_series, col_name, other_date_columns=other_dates)\n",
|
|
934
|
+
" \n",
|
|
935
|
+
" # Group by recommendation type\n",
|
|
936
|
+
" col_feature_recs = [r for r in recommendations if r.recommendation_type == TemporalRecommendationType.FEATURE_ENGINEERING]\n",
|
|
937
|
+
" col_modeling_recs = [r for r in recommendations if r.recommendation_type == TemporalRecommendationType.MODELING_STRATEGY]\n",
|
|
938
|
+
" col_quality_recs = [r for r in recommendations if r.recommendation_type == TemporalRecommendationType.DATA_QUALITY]\n",
|
|
939
|
+
" \n",
|
|
940
|
+
" feature_engineering_recs.extend(col_feature_recs)\n",
|
|
941
|
+
" modeling_strategy_recs.extend(col_modeling_recs)\n",
|
|
942
|
+
" data_quality_recs.extend(col_quality_recs)\n",
|
|
943
|
+
" \n",
|
|
944
|
+
" # Display recommendations grouped by type\n",
|
|
945
|
+
" if col_feature_recs:\n",
|
|
946
|
+
" print(f\"\\n🛠️ FEATURES TO CREATE:\")\n",
|
|
947
|
+
" for rec in col_feature_recs:\n",
|
|
948
|
+
" priority_icon = \"🔴\" if rec.priority == \"high\" else \"🟡\" if rec.priority == \"medium\" else \"✓\"\n",
|
|
949
|
+
" print(f\" {priority_icon} {rec.feature_name} ({rec.category})\")\n",
|
|
950
|
+
" print(f\" Why: {rec.reason}\")\n",
|
|
951
|
+
" if rec.code_hint:\n",
|
|
952
|
+
" print(f\" Code: {rec.code_hint}\")\n",
|
|
953
|
+
" \n",
|
|
954
|
+
" if col_modeling_recs:\n",
|
|
955
|
+
" print(f\"\\n⚙️ MODELING CONSIDERATIONS:\")\n",
|
|
956
|
+
" for rec in col_modeling_recs:\n",
|
|
957
|
+
" priority_icon = \"🔴\" if rec.priority == \"high\" else \"🟡\" if rec.priority == \"medium\" else \"✓\"\n",
|
|
958
|
+
" print(f\" {priority_icon} {rec.feature_name}\")\n",
|
|
959
|
+
" print(f\" Why: {rec.reason}\")\n",
|
|
960
|
+
" \n",
|
|
961
|
+
" if col_quality_recs:\n",
|
|
962
|
+
" print(f\"\\n⚠️ DATA QUALITY ISSUES:\")\n",
|
|
963
|
+
" for rec in col_quality_recs:\n",
|
|
964
|
+
" priority_icon = \"🔴\" if rec.priority == \"high\" else \"🟡\" if rec.priority == \"medium\" else \"✓\"\n",
|
|
965
|
+
" print(f\" {priority_icon} {rec.feature_name}\")\n",
|
|
966
|
+
" print(f\" Why: {rec.reason}\")\n",
|
|
967
|
+
" if rec.code_hint:\n",
|
|
968
|
+
" print(f\" Code: {rec.code_hint}\")\n",
|
|
969
|
+
" \n",
|
|
970
|
+
" # Standard extractions always available\n",
|
|
971
|
+
" print(f\"\\n Standard extractions available: year, month, day, day_of_week, quarter\")\n",
|
|
972
|
+
" \n",
|
|
973
|
+
" # Store summary\n",
|
|
974
|
+
" datetime_summaries.append({\n",
|
|
975
|
+
" \"Column\": col_name,\n",
|
|
976
|
+
" \"Span (days)\": analysis.span_days,\n",
|
|
977
|
+
" \"Seasonality\": \"Yes\" if seasonality.has_seasonality else \"No\",\n",
|
|
978
|
+
" \"Trend\": growth.get('trend_direction', 'N/A').capitalize() if growth.get(\"has_data\") else \"N/A\",\n",
|
|
979
|
+
" \"Features to Create\": len(col_feature_recs),\n",
|
|
980
|
+
" \"Modeling Notes\": len(col_modeling_recs),\n",
|
|
981
|
+
" \"Quality Issues\": len(col_quality_recs)\n",
|
|
982
|
+
" })\n",
|
|
983
|
+
" \n",
|
|
984
|
+
" # === VISUALIZATIONS ===\n",
|
|
985
|
+
" \n",
|
|
986
|
+
" if growth.get(\"has_data\"):\n",
|
|
987
|
+
" fig = charts.growth_summary_indicators(growth, title=f\"Growth Summary: {col_name}\")\n",
|
|
988
|
+
" display_figure(fig)\n",
|
|
989
|
+
" \n",
|
|
990
|
+
" chart_type = \"line\" if analysis.granularity in [TemporalGranularity.DAY, TemporalGranularity.WEEK] else \"bar\"\n",
|
|
991
|
+
" fig = charts.temporal_distribution(analysis, title=f\"Records Over Time: {col_name}\", chart_type=chart_type)\n",
|
|
992
|
+
" display_figure(fig)\n",
|
|
993
|
+
" \n",
|
|
994
|
+
" fig = charts.temporal_trend(analysis, title=f\"Trend Analysis: {col_name}\")\n",
|
|
995
|
+
" display_figure(fig)\n",
|
|
996
|
+
" \n",
|
|
997
|
+
" yoy_data = temporal_analyzer.year_over_year_comparison(date_series)\n",
|
|
998
|
+
" if len(yoy_data) > 1:\n",
|
|
999
|
+
" fig = charts.year_over_year_lines(yoy_data, title=f\"Year-over-Year: {col_name}\")\n",
|
|
1000
|
+
" display_figure(fig)\n",
|
|
1001
|
+
" fig = charts.year_month_heatmap(yoy_data, title=f\"Records Heatmap: {col_name}\")\n",
|
|
1002
|
+
" display_figure(fig)\n",
|
|
1003
|
+
" \n",
|
|
1004
|
+
" if growth.get(\"has_data\"):\n",
|
|
1005
|
+
" fig = charts.cumulative_growth_chart(growth[\"cumulative\"], title=f\"Cumulative Records: {col_name}\")\n",
|
|
1006
|
+
" display_figure(fig)\n",
|
|
1007
|
+
" \n",
|
|
1008
|
+
" fig = charts.temporal_heatmap(date_series, title=f\"Day of Week Distribution: {col_name}\")\n",
|
|
1009
|
+
" display_figure(fig)\n",
|
|
1010
|
+
"\n",
|
|
1011
|
+
"# === DATETIME SUMMARY ===\n",
|
|
1012
|
+
"if datetime_summaries:\n",
|
|
1013
|
+
" print(\"\\n\" + \"=\" * 70)\n",
|
|
1014
|
+
" print(\"DATETIME COLUMNS SUMMARY\")\n",
|
|
1015
|
+
" print(\"=\" * 70)\n",
|
|
1016
|
+
" display_table(pd.DataFrame(datetime_summaries))\n",
|
|
1017
|
+
" \n",
|
|
1018
|
+
" # Summary by recommendation type\n",
|
|
1019
|
+
" print(\"\\n📋 ALL RECOMMENDATIONS BY TYPE:\")\n",
|
|
1020
|
+
" \n",
|
|
1021
|
+
" if feature_engineering_recs:\n",
|
|
1022
|
+
" print(f\"\\n🛠️ FEATURES TO CREATE ({len(feature_engineering_recs)}):\")\n",
|
|
1023
|
+
" for i, rec in enumerate(feature_engineering_recs, 1):\n",
|
|
1024
|
+
" priority_icon = \"🔴\" if rec.priority == \"high\" else \"🟡\" if rec.priority == \"medium\" else \"✓\"\n",
|
|
1025
|
+
" print(f\" {i}. {priority_icon} {rec.feature_name}\")\n",
|
|
1026
|
+
" \n",
|
|
1027
|
+
" if modeling_strategy_recs:\n",
|
|
1028
|
+
" print(f\"\\n⚙️ MODELING CONSIDERATIONS ({len(modeling_strategy_recs)}):\")\n",
|
|
1029
|
+
" for i, rec in enumerate(modeling_strategy_recs, 1):\n",
|
|
1030
|
+
" priority_icon = \"🔴\" if rec.priority == \"high\" else \"🟡\" if rec.priority == \"medium\" else \"✓\"\n",
|
|
1031
|
+
" print(f\" {i}. {priority_icon} {rec.feature_name}: {rec.reason}\")\n",
|
|
1032
|
+
" \n",
|
|
1033
|
+
" if data_quality_recs:\n",
|
|
1034
|
+
" print(f\"\\n⚠️ DATA QUALITY TO ADDRESS ({len(data_quality_recs)}):\")\n",
|
|
1035
|
+
" for i, rec in enumerate(data_quality_recs, 1):\n",
|
|
1036
|
+
" priority_icon = \"🔴\" if rec.priority == \"high\" else \"🟡\" if rec.priority == \"medium\" else \"✓\"\n",
|
|
1037
|
+
" print(f\" {i}. {priority_icon} {rec.feature_name}: {rec.reason}\")\n",
|
|
1038
|
+
" \n",
|
|
1039
|
+
" # Add recommendations to registry\n",
|
|
1040
|
+
" added_derived = 0\n",
|
|
1041
|
+
" added_modeling = 0\n",
|
|
1042
|
+
" \n",
|
|
1043
|
+
" # Add feature engineering recommendations to Silver layer (derived columns)\n",
|
|
1044
|
+
" if registry.silver:\n",
|
|
1045
|
+
" for rec in feature_engineering_recs:\n",
|
|
1046
|
+
" registry.add_silver_derived(\n",
|
|
1047
|
+
" column=rec.feature_name,\n",
|
|
1048
|
+
" expression=rec.code_hint or \"\",\n",
|
|
1049
|
+
" feature_type=rec.category,\n",
|
|
1050
|
+
" rationale=rec.reason,\n",
|
|
1051
|
+
" source_notebook=\"02_column_deep_dive\"\n",
|
|
1052
|
+
" )\n",
|
|
1053
|
+
" added_derived += 1\n",
|
|
1054
|
+
" \n",
|
|
1055
|
+
" # Add modeling strategy recommendations to Bronze layer\n",
|
|
1056
|
+
" seen_strategies = set()\n",
|
|
1057
|
+
" for rec in modeling_strategy_recs:\n",
|
|
1058
|
+
" if rec.feature_name not in seen_strategies:\n",
|
|
1059
|
+
" registry.add_bronze_modeling_strategy(\n",
|
|
1060
|
+
" strategy=rec.feature_name,\n",
|
|
1061
|
+
" column=datetime_cols[0] if datetime_cols else \"\",\n",
|
|
1062
|
+
" parameters={\"category\": rec.category},\n",
|
|
1063
|
+
" rationale=rec.reason,\n",
|
|
1064
|
+
" source_notebook=\"02_column_deep_dive\"\n",
|
|
1065
|
+
" )\n",
|
|
1066
|
+
" seen_strategies.add(rec.feature_name)\n",
|
|
1067
|
+
" added_modeling += 1\n",
|
|
1068
|
+
" \n",
|
|
1069
|
+
" print(f\"\\n✅ Added {added_derived} derived column recommendations to Silver layer\")\n",
|
|
1070
|
+
" print(f\"✅ Added {added_modeling} modeling strategy recommendations to Bronze layer\")"
|
|
1071
|
+
]
|
|
1072
|
+
},
|
|
1073
|
+
{
|
|
1074
|
+
"cell_type": "markdown",
|
|
1075
|
+
"id": "21b21acc",
|
|
1076
|
+
"metadata": {
|
|
1077
|
+
"papermill": {
|
|
1078
|
+
"duration": 0.139274,
|
|
1079
|
+
"end_time": "2026-02-02T13:03:04.226289",
|
|
1080
|
+
"exception": false,
|
|
1081
|
+
"start_time": "2026-02-02T13:03:04.087015",
|
|
1082
|
+
"status": "completed"
|
|
1083
|
+
},
|
|
1084
|
+
"tags": []
|
|
1085
|
+
},
|
|
1086
|
+
"source": [
|
|
1087
|
+
"## 2.8 Type Override (Optional)\n",
|
|
1088
|
+
"\n",
|
|
1089
|
+
"If any column types were incorrectly inferred, you can override them here.\n",
|
|
1090
|
+
"\n",
|
|
1091
|
+
"**Common overrides:**\n",
|
|
1092
|
+
"- Binary columns detected as numeric → `ColumnType.BINARY`\n",
|
|
1093
|
+
"- IDs detected as numeric → `ColumnType.IDENTIFIER`\n",
|
|
1094
|
+
"- Ordinal categories detected as nominal → `ColumnType.CATEGORICAL_ORDINAL`"
|
|
1095
|
+
]
|
|
1096
|
+
},
|
|
1097
|
+
{
|
|
1098
|
+
"cell_type": "code",
|
|
1099
|
+
"execution_count": null,
|
|
1100
|
+
"id": "00245d72",
|
|
1101
|
+
"metadata": {
|
|
1102
|
+
"execution": {
|
|
1103
|
+
"iopub.execute_input": "2026-02-02T13:03:04.430049Z",
|
|
1104
|
+
"iopub.status.busy": "2026-02-02T13:03:04.429934Z",
|
|
1105
|
+
"iopub.status.idle": "2026-02-02T13:03:04.432555Z",
|
|
1106
|
+
"shell.execute_reply": "2026-02-02T13:03:04.432110Z"
|
|
1107
|
+
},
|
|
1108
|
+
"papermill": {
|
|
1109
|
+
"duration": 0.105193,
|
|
1110
|
+
"end_time": "2026-02-02T13:03:04.433172",
|
|
1111
|
+
"exception": false,
|
|
1112
|
+
"start_time": "2026-02-02T13:03:04.327979",
|
|
1113
|
+
"status": "completed"
|
|
1114
|
+
},
|
|
1115
|
+
"tags": []
|
|
1116
|
+
},
|
|
1117
|
+
"outputs": [],
|
|
1118
|
+
"source": [
|
|
1119
|
+
"# === TYPE OVERRIDES ===\n",
|
|
1120
|
+
"# Uncomment and modify to override any incorrectly inferred types\n",
|
|
1121
|
+
"TYPE_OVERRIDES = {\n",
|
|
1122
|
+
" # \"column_name\": ColumnType.NEW_TYPE,\n",
|
|
1123
|
+
" # Examples:\n",
|
|
1124
|
+
" # \"is_active\": ColumnType.BINARY,\n",
|
|
1125
|
+
" # \"user_id\": ColumnType.IDENTIFIER,\n",
|
|
1126
|
+
" # \"satisfaction_level\": ColumnType.CATEGORICAL_ORDINAL,\n",
|
|
1127
|
+
"}\n",
|
|
1128
|
+
"\n",
|
|
1129
|
+
"if TYPE_OVERRIDES:\n",
|
|
1130
|
+
" print(\"Applying type overrides:\")\n",
|
|
1131
|
+
" for col_name, new_type in TYPE_OVERRIDES.items():\n",
|
|
1132
|
+
" if col_name in findings.columns:\n",
|
|
1133
|
+
" old_type = findings.columns[col_name].inferred_type.value\n",
|
|
1134
|
+
" findings.columns[col_name].inferred_type = new_type\n",
|
|
1135
|
+
" findings.columns[col_name].confidence = 1.0\n",
|
|
1136
|
+
" findings.columns[col_name].evidence.append(\"Manually overridden\")\n",
|
|
1137
|
+
" print(f\" {col_name}: {old_type} → {new_type.value}\")\n",
|
|
1138
|
+
"else:\n",
|
|
1139
|
+
" print(\"No type overrides configured.\")\n",
|
|
1140
|
+
" print(\"To override a type, add entries to TYPE_OVERRIDES dictionary above.\")"
|
|
1141
|
+
]
|
|
1142
|
+
},
|
|
1143
|
+
{
|
|
1144
|
+
"cell_type": "markdown",
|
|
1145
|
+
"id": "96593591",
|
|
1146
|
+
"metadata": {
|
|
1147
|
+
"papermill": {
|
|
1148
|
+
"duration": 0.102127,
|
|
1149
|
+
"end_time": "2026-02-02T13:03:04.673253",
|
|
1150
|
+
"exception": false,
|
|
1151
|
+
"start_time": "2026-02-02T13:03:04.571126",
|
|
1152
|
+
"status": "completed"
|
|
1153
|
+
},
|
|
1154
|
+
"tags": []
|
|
1155
|
+
},
|
|
1156
|
+
"source": [
|
|
1157
|
+
"## 2.9 Data Segmentation Analysis\n",
|
|
1158
|
+
"\n",
|
|
1159
|
+
"**Purpose:** Determine if the dataset contains natural subgroups that might benefit from separate models.\n",
|
|
1160
|
+
"\n",
|
|
1161
|
+
"**📖 Why This Matters:**\n",
|
|
1162
|
+
"- Some datasets have distinct customer segments with very different behaviors\n",
|
|
1163
|
+
"- A single model might struggle to capture patterns that vary significantly across segments\n",
|
|
1164
|
+
"- Segmented models can improve accuracy but add maintenance complexity\n",
|
|
1165
|
+
"\n",
|
|
1166
|
+
"**Recommendations:**\n",
|
|
1167
|
+
"- **single_model** - Data is homogeneous; one model for all records\n",
|
|
1168
|
+
"- **consider_segmentation** - Some variation exists; evaluate if complexity is worth it\n",
|
|
1169
|
+
"- **strong_segmentation** - Distinct segments with different target rates; separate models likely beneficial\n",
|
|
1170
|
+
"\n",
|
|
1171
|
+
"**Important:** This is exploratory guidance only. The final decision depends on business context, model complexity tolerance, and available resources."
|
|
1172
|
+
]
|
|
1173
|
+
},
|
|
1174
|
+
{
|
|
1175
|
+
"cell_type": "code",
|
|
1176
|
+
"execution_count": null,
|
|
1177
|
+
"id": "ad8d552a",
|
|
1178
|
+
"metadata": {
|
|
1179
|
+
"execution": {
|
|
1180
|
+
"iopub.execute_input": "2026-02-02T13:03:04.877339Z",
|
|
1181
|
+
"iopub.status.busy": "2026-02-02T13:03:04.877224Z",
|
|
1182
|
+
"iopub.status.idle": "2026-02-02T13:03:05.087709Z",
|
|
1183
|
+
"shell.execute_reply": "2026-02-02T13:03:05.086847Z"
|
|
1184
|
+
},
|
|
1185
|
+
"papermill": {
|
|
1186
|
+
"duration": 0.313839,
|
|
1187
|
+
"end_time": "2026-02-02T13:03:05.088528",
|
|
1188
|
+
"exception": false,
|
|
1189
|
+
"start_time": "2026-02-02T13:03:04.774689",
|
|
1190
|
+
"status": "completed"
|
|
1191
|
+
},
|
|
1192
|
+
"tags": []
|
|
1193
|
+
},
|
|
1194
|
+
"outputs": [],
|
|
1195
|
+
"source": [
|
|
1196
|
+
"from customer_retention.stages.profiling import SegmentAnalyzer\n",
|
|
1197
|
+
"\n",
|
|
1198
|
+
"# Initialize segment analyzer\n",
|
|
1199
|
+
"segment_analyzer = SegmentAnalyzer()\n",
|
|
1200
|
+
"\n",
|
|
1201
|
+
"# Find target column if detected\n",
|
|
1202
|
+
"target_col = None\n",
|
|
1203
|
+
"for col_name, col_info in findings.columns.items():\n",
|
|
1204
|
+
" if col_info.inferred_type == ColumnType.TARGET:\n",
|
|
1205
|
+
" target_col = col_name\n",
|
|
1206
|
+
" break\n",
|
|
1207
|
+
"\n",
|
|
1208
|
+
"# Run segmentation analysis using numeric features\n",
|
|
1209
|
+
"print(\"=\"*70)\n",
|
|
1210
|
+
"print(\"DATA SEGMENTATION ANALYSIS\")\n",
|
|
1211
|
+
"print(\"=\"*70)\n",
|
|
1212
|
+
"\n",
|
|
1213
|
+
"segmentation = segment_analyzer.analyze(\n",
|
|
1214
|
+
" df,\n",
|
|
1215
|
+
" target_col=target_col,\n",
|
|
1216
|
+
" feature_cols=numeric_cols if numeric_cols else None,\n",
|
|
1217
|
+
" max_segments=5\n",
|
|
1218
|
+
")\n",
|
|
1219
|
+
"\n",
|
|
1220
|
+
"print(f\"\\n🎯 Analysis Results:\")\n",
|
|
1221
|
+
"print(f\" Method: {segmentation.method.value}\")\n",
|
|
1222
|
+
"print(f\" Detected Segments: {segmentation.n_segments}\")\n",
|
|
1223
|
+
"print(f\" Cluster Quality Score: {segmentation.quality_score:.2f}\")\n",
|
|
1224
|
+
"if segmentation.target_variance_ratio is not None:\n",
|
|
1225
|
+
" print(f\" Target Variance Ratio: {segmentation.target_variance_ratio:.2f}\")\n",
|
|
1226
|
+
"\n",
|
|
1227
|
+
"print(f\"\\n📊 Segment Profiles:\")\n",
|
|
1228
|
+
"for profile in segmentation.profiles:\n",
|
|
1229
|
+
" target_info = f\" | Target Rate: {profile.target_rate*100:.1f}%\" if profile.target_rate is not None else \"\"\n",
|
|
1230
|
+
" print(f\" Segment {profile.segment_id}: {profile.size:,} records ({profile.size_pct:.1f}%){target_info}\")\n",
|
|
1231
|
+
"\n",
|
|
1232
|
+
"# Display recommendation card\n",
|
|
1233
|
+
"fig = charts.segment_recommendation_card(segmentation)\n",
|
|
1234
|
+
"display_figure(fig)\n",
|
|
1235
|
+
"\n",
|
|
1236
|
+
"# Display segment overview\n",
|
|
1237
|
+
"fig = charts.segment_overview(segmentation, title=\"Segment Overview\")\n",
|
|
1238
|
+
"display_figure(fig)\n",
|
|
1239
|
+
"\n",
|
|
1240
|
+
"# Display feature comparison if we have features\n",
|
|
1241
|
+
"if segmentation.n_segments > 1 and any(p.defining_features for p in segmentation.profiles):\n",
|
|
1242
|
+
" fig = charts.segment_feature_comparison(segmentation, title=\"Feature Comparison Across Segments\")\n",
|
|
1243
|
+
" display_figure(fig)\n",
|
|
1244
|
+
"\n",
|
|
1245
|
+
"print(f\"\\n📝 Rationale:\")\n",
|
|
1246
|
+
"for reason in segmentation.rationale:\n",
|
|
1247
|
+
" print(f\" • {reason}\")"
|
|
1248
|
+
]
|
|
1249
|
+
},
|
|
1250
|
+
{
|
|
1251
|
+
"cell_type": "markdown",
|
|
1252
|
+
"id": "2f3e371f",
|
|
1253
|
+
"metadata": {
|
|
1254
|
+
"papermill": {
|
|
1255
|
+
"duration": 0.144582,
|
|
1256
|
+
"end_time": "2026-02-02T13:03:05.348148",
|
|
1257
|
+
"exception": false,
|
|
1258
|
+
"start_time": "2026-02-02T13:03:05.203566",
|
|
1259
|
+
"status": "completed"
|
|
1260
|
+
},
|
|
1261
|
+
"tags": []
|
|
1262
|
+
},
|
|
1263
|
+
"source": [
|
|
1264
|
+
"## 2.10 Save Updated Findings"
|
|
1265
|
+
]
|
|
1266
|
+
},
|
|
1267
|
+
{
|
|
1268
|
+
"cell_type": "code",
|
|
1269
|
+
"execution_count": null,
|
|
1270
|
+
"id": "fc32cba0",
|
|
1271
|
+
"metadata": {
|
|
1272
|
+
"execution": {
|
|
1273
|
+
"iopub.execute_input": "2026-02-02T13:03:05.560506Z",
|
|
1274
|
+
"iopub.status.busy": "2026-02-02T13:03:05.560391Z",
|
|
1275
|
+
"iopub.status.idle": "2026-02-02T13:03:05.639998Z",
|
|
1276
|
+
"shell.execute_reply": "2026-02-02T13:03:05.639277Z"
|
|
1277
|
+
},
|
|
1278
|
+
"papermill": {
|
|
1279
|
+
"duration": 0.186106,
|
|
1280
|
+
"end_time": "2026-02-02T13:03:05.640834",
|
|
1281
|
+
"exception": false,
|
|
1282
|
+
"start_time": "2026-02-02T13:03:05.454728",
|
|
1283
|
+
"status": "completed"
|
|
1284
|
+
},
|
|
1285
|
+
"tags": []
|
|
1286
|
+
},
|
|
1287
|
+
"outputs": [],
|
|
1288
|
+
"source": [
|
|
1289
|
+
"# Save updated findings back to the same file\n",
|
|
1290
|
+
"findings.save(FINDINGS_PATH)\n",
|
|
1291
|
+
"print(f\"Updated findings saved to: {FINDINGS_PATH}\")\n",
|
|
1292
|
+
"\n",
|
|
1293
|
+
"# Save recommendations registry\n",
|
|
1294
|
+
"recommendations_path = FINDINGS_PATH.replace(\"_findings.yaml\", \"_recommendations.yaml\")\n",
|
|
1295
|
+
"registry.save(recommendations_path)\n",
|
|
1296
|
+
"print(f\"Recommendations saved to: {recommendations_path}\")\n",
|
|
1297
|
+
"\n",
|
|
1298
|
+
"# Summary of recommendations\n",
|
|
1299
|
+
"all_recs = registry.all_recommendations\n",
|
|
1300
|
+
"print(f\"\\n📋 Recommendations Summary:\")\n",
|
|
1301
|
+
"print(f\" Bronze layer: {len(registry.get_by_layer('bronze'))} recommendations\")\n",
|
|
1302
|
+
"print(f\" Silver layer: {len(registry.get_by_layer('silver'))} recommendations\")\n",
|
|
1303
|
+
"print(f\" Gold layer: {len(registry.get_by_layer('gold'))} recommendations\")\n",
|
|
1304
|
+
"print(f\" Total: {len(all_recs)} recommendations\")\n"
|
|
1305
|
+
]
|
|
1306
|
+
},
|
|
1307
|
+
{
|
|
1308
|
+
"cell_type": "markdown",
|
|
1309
|
+
"id": "4f8202b3",
|
|
1310
|
+
"metadata": {
|
|
1311
|
+
"papermill": {
|
|
1312
|
+
"duration": 0.14125,
|
|
1313
|
+
"end_time": "2026-02-02T13:03:05.892090",
|
|
1314
|
+
"exception": false,
|
|
1315
|
+
"start_time": "2026-02-02T13:03:05.750840",
|
|
1316
|
+
"status": "completed"
|
|
1317
|
+
},
|
|
1318
|
+
"tags": []
|
|
1319
|
+
},
|
|
1320
|
+
"source": [
|
|
1321
|
+
"---\n",
|
|
1322
|
+
"\n",
|
|
1323
|
+
"## Summary: What We Learned\n",
|
|
1324
|
+
"\n",
|
|
1325
|
+
"In this notebook, we performed a deep dive analysis that included:\n",
|
|
1326
|
+
"\n",
|
|
1327
|
+
"1. **Value Range Validation** - Validated rates, binary fields, and non-negative constraints\n",
|
|
1328
|
+
"2. **Numeric Distribution Analysis** - Calculated skewness, kurtosis, and percentiles with transformation recommendations\n",
|
|
1329
|
+
"3. **Categorical Distribution Analysis** - Calculated imbalance ratio, entropy, and concentration with encoding recommendations\n",
|
|
1330
|
+
"4. **Datetime Analysis** - Analyzed seasonality, trends, and patterns with feature engineering recommendations\n",
|
|
1331
|
+
"5. **Data Segmentation** - Evaluated if natural subgroups exist that might benefit from separate models\n",
|
|
1332
|
+
"\n",
|
|
1333
|
+
"## Key Metrics Reference\n",
|
|
1334
|
+
"\n",
|
|
1335
|
+
"**Numeric Columns:**\n",
|
|
1336
|
+
"| Metric | Threshold | Action |\n",
|
|
1337
|
+
"|--------|-----------|--------|\n",
|
|
1338
|
+
"| Skewness | \\|skew\\| > 1 | Log transform |\n",
|
|
1339
|
+
"| Kurtosis | > 10 | Cap outliers first |\n",
|
|
1340
|
+
"| Zero % | > 40% | Zero-inflation handling |\n",
|
|
1341
|
+
"\n",
|
|
1342
|
+
"**Categorical Columns:**\n",
|
|
1343
|
+
"| Metric | Threshold | Action |\n",
|
|
1344
|
+
"|--------|-----------|--------|\n",
|
|
1345
|
+
"| Imbalance Ratio | > 10x | Group rare categories |\n",
|
|
1346
|
+
"| Entropy | < 50% | Stratified sampling |\n",
|
|
1347
|
+
"| Rare Categories | > 0 | Group into \"Other\" |\n",
|
|
1348
|
+
"\n",
|
|
1349
|
+
"**Datetime Columns:**\n",
|
|
1350
|
+
"| Finding | Action |\n",
|
|
1351
|
+
"|---------|--------|\n",
|
|
1352
|
+
"| Seasonality | Add cyclical month encoding |\n",
|
|
1353
|
+
"| Strong trend | Time-based train/test split |\n",
|
|
1354
|
+
"| Multiple dates | Calculate duration features |\n",
|
|
1355
|
+
"| Placeholder dates | Filter or flag |\n",
|
|
1356
|
+
"\n",
|
|
1357
|
+
"## Transformation & Encoding Summary\n",
|
|
1358
|
+
"\n",
|
|
1359
|
+
"Review the summary tables above for:\n",
|
|
1360
|
+
"- **Numeric**: Which columns need log transforms, capping, or zero-inflation handling\n",
|
|
1361
|
+
"- **Categorical**: Which encoding to use and whether to group rare categories\n",
|
|
1362
|
+
"- **Datetime**: Which temporal features to engineer based on detected patterns\n",
|
|
1363
|
+
"\n",
|
|
1364
|
+
"---\n",
|
|
1365
|
+
"\n",
|
|
1366
|
+
"## Next Steps\n",
|
|
1367
|
+
"\n",
|
|
1368
|
+
"Continue to **03_quality_assessment.ipynb** to:\n",
|
|
1369
|
+
"- Analyze duplicate records and value conflicts\n",
|
|
1370
|
+
"- Deep dive into missing value patterns\n",
|
|
1371
|
+
"- Analyze outliers with IQR method\n",
|
|
1372
|
+
"- Check data consistency\n",
|
|
1373
|
+
"- Get cleaning recommendations\n",
|
|
1374
|
+
"\n",
|
|
1375
|
+
"Or jump to **05_feature_opportunities.ipynb** if you want to see derived feature recommendations."
|
|
1376
|
+
]
|
|
1377
|
+
},
|
|
1378
|
+
{
|
|
1379
|
+
"cell_type": "markdown",
|
|
1380
|
+
"id": "f114cb0a",
|
|
1381
|
+
"metadata": {
|
|
1382
|
+
"papermill": {
|
|
1383
|
+
"duration": 0.114526,
|
|
1384
|
+
"end_time": "2026-02-02T13:03:06.112557",
|
|
1385
|
+
"exception": false,
|
|
1386
|
+
"start_time": "2026-02-02T13:03:05.998031",
|
|
1387
|
+
"status": "completed"
|
|
1388
|
+
},
|
|
1389
|
+
"tags": []
|
|
1390
|
+
},
|
|
1391
|
+
"source": [
|
|
1392
|
+
"> **Save Reminder:** Save this notebook (Ctrl+S / Cmd+S) before running the next one.\n",
|
|
1393
|
+
"> The next notebook will automatically export this notebook's HTML documentation from the saved file."
|
|
1394
|
+
]
|
|
1395
|
+
}
|
|
1396
|
+
],
|
|
1397
|
+
"metadata": {
|
|
1398
|
+
"kernelspec": {
|
|
1399
|
+
"display_name": "Python 3",
|
|
1400
|
+
"language": "python",
|
|
1401
|
+
"name": "python3"
|
|
1402
|
+
},
|
|
1403
|
+
"language_info": {
|
|
1404
|
+
"codemirror_mode": {
|
|
1405
|
+
"name": "ipython",
|
|
1406
|
+
"version": 3
|
|
1407
|
+
},
|
|
1408
|
+
"file_extension": ".py",
|
|
1409
|
+
"mimetype": "text/x-python",
|
|
1410
|
+
"name": "python",
|
|
1411
|
+
"nbconvert_exporter": "python",
|
|
1412
|
+
"pygments_lexer": "ipython3",
|
|
1413
|
+
"version": "3.12.4"
|
|
1414
|
+
},
|
|
1415
|
+
"papermill": {
|
|
1416
|
+
"default_parameters": {},
|
|
1417
|
+
"duration": 8.078058,
|
|
1418
|
+
"end_time": "2026-02-02T13:03:06.637909",
|
|
1419
|
+
"environment_variables": {},
|
|
1420
|
+
"exception": null,
|
|
1421
|
+
"input_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/02_column_deep_dive.ipynb",
|
|
1422
|
+
"output_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/02_column_deep_dive.ipynb",
|
|
1423
|
+
"parameters": {},
|
|
1424
|
+
"start_time": "2026-02-02T13:02:58.559851",
|
|
1425
|
+
"version": "2.6.0"
|
|
1426
|
+
}
|
|
1427
|
+
},
|
|
1428
|
+
"nbformat": 4,
|
|
1429
|
+
"nbformat_minor": 5
|
|
1430
|
+
}
|