churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb
ADDED
|
@@ -0,0 +1,1624 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"id": "ceca3f3f",
|
|
6
|
+
"metadata": {
|
|
7
|
+
"papermill": {
|
|
8
|
+
"duration": 0.003774,
|
|
9
|
+
"end_time": "2026-02-02T13:03:27.226665",
|
|
10
|
+
"exception": false,
|
|
11
|
+
"start_time": "2026-02-02T13:03:27.222891",
|
|
12
|
+
"status": "completed"
|
|
13
|
+
},
|
|
14
|
+
"tags": []
|
|
15
|
+
},
|
|
16
|
+
"source": [
|
|
17
|
+
"# Chapter 6: Feature Opportunities\n",
|
|
18
|
+
"\n",
|
|
19
|
+
"**Purpose:** Identify and implement feature engineering opportunities to improve model performance.\n",
|
|
20
|
+
"\n",
|
|
21
|
+
"**What you'll learn:**\n",
|
|
22
|
+
"- How to derive time-based features (tenure, recency, active period)\n",
|
|
23
|
+
"- How to create composite engagement scores\n",
|
|
24
|
+
"- How to segment customers based on behavior patterns\n",
|
|
25
|
+
"- How to encode categorical variables effectively\n",
|
|
26
|
+
"\n",
|
|
27
|
+
"**Outputs:**\n",
|
|
28
|
+
"- Derived feature recommendations with code examples\n",
|
|
29
|
+
"- Composite score formulas (engagement, service adoption)\n",
|
|
30
|
+
"- Customer segmentation rules\n",
|
|
31
|
+
"- Categorical encoding strategies\n",
|
|
32
|
+
"\n",
|
|
33
|
+
"---\n",
|
|
34
|
+
"\n",
|
|
35
|
+
"## Why Feature Engineering Matters\n",
|
|
36
|
+
"\n",
|
|
37
|
+
"| Feature Type | Business Meaning | Predictive Power |\n",
|
|
38
|
+
"|-------------|-----------------|------------------|\n",
|
|
39
|
+
"| **Tenure** | How long customer has been with us | Loyalty indicator |\n",
|
|
40
|
+
"| **Recency** | Days since last order | Engagement/churn signal |\n",
|
|
41
|
+
"| **Engagement Score** | Combined email metrics | Overall engagement level |\n",
|
|
42
|
+
"| **Segments** | High/Low value × Frequent/Infrequent | Risk stratification |"
|
|
43
|
+
]
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"cell_type": "markdown",
|
|
47
|
+
"id": "4dd512f5",
|
|
48
|
+
"metadata": {
|
|
49
|
+
"papermill": {
|
|
50
|
+
"duration": 0.00238,
|
|
51
|
+
"end_time": "2026-02-02T13:03:27.232489",
|
|
52
|
+
"exception": false,
|
|
53
|
+
"start_time": "2026-02-02T13:03:27.230109",
|
|
54
|
+
"status": "completed"
|
|
55
|
+
},
|
|
56
|
+
"tags": []
|
|
57
|
+
},
|
|
58
|
+
"source": [
|
|
59
|
+
"## 6.1 Setup"
|
|
60
|
+
]
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
"cell_type": "code",
|
|
64
|
+
"execution_count": null,
|
|
65
|
+
"id": "485495bd",
|
|
66
|
+
"metadata": {
|
|
67
|
+
"execution": {
|
|
68
|
+
"iopub.execute_input": "2026-02-02T13:03:27.239137Z",
|
|
69
|
+
"iopub.status.busy": "2026-02-02T13:03:27.239011Z",
|
|
70
|
+
"iopub.status.idle": "2026-02-02T13:03:29.003230Z",
|
|
71
|
+
"shell.execute_reply": "2026-02-02T13:03:29.002430Z"
|
|
72
|
+
},
|
|
73
|
+
"papermill": {
|
|
74
|
+
"duration": 1.768958,
|
|
75
|
+
"end_time": "2026-02-02T13:03:29.004317",
|
|
76
|
+
"exception": false,
|
|
77
|
+
"start_time": "2026-02-02T13:03:27.235359",
|
|
78
|
+
"status": "completed"
|
|
79
|
+
},
|
|
80
|
+
"tags": []
|
|
81
|
+
},
|
|
82
|
+
"outputs": [],
|
|
83
|
+
"source": [
|
|
84
|
+
"from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
|
|
85
|
+
"track_and_export_previous(\"06_feature_opportunities.ipynb\")\n",
|
|
86
|
+
"\n",
|
|
87
|
+
"from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationEngine, RecommendationRegistry\n",
|
|
88
|
+
"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
|
|
89
|
+
"from customer_retention.core.config.column_config import ColumnType\n",
|
|
90
|
+
"from customer_retention.stages.features import CustomerSegmenter, SegmentationType\n",
|
|
91
|
+
"from customer_retention.stages.profiling import FeatureCapacityAnalyzer\n",
|
|
92
|
+
"import yaml\n",
|
|
93
|
+
"import pandas as pd\n",
|
|
94
|
+
"import numpy as np\n",
|
|
95
|
+
"import plotly.graph_objects as go\n",
|
|
96
|
+
"import plotly.express as px\n",
|
|
97
|
+
"from plotly.subplots import make_subplots\n",
|
|
98
|
+
"from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure\n"
|
|
99
|
+
]
|
|
100
|
+
},
|
|
101
|
+
{
|
|
102
|
+
"cell_type": "code",
|
|
103
|
+
"execution_count": null,
|
|
104
|
+
"id": "f708610d",
|
|
105
|
+
"metadata": {
|
|
106
|
+
"execution": {
|
|
107
|
+
"iopub.execute_input": "2026-02-02T13:03:29.021480Z",
|
|
108
|
+
"iopub.status.busy": "2026-02-02T13:03:29.021296Z",
|
|
109
|
+
"iopub.status.idle": "2026-02-02T13:03:29.486668Z",
|
|
110
|
+
"shell.execute_reply": "2026-02-02T13:03:29.485969Z"
|
|
111
|
+
},
|
|
112
|
+
"papermill": {
|
|
113
|
+
"duration": 0.474341,
|
|
114
|
+
"end_time": "2026-02-02T13:03:29.487573",
|
|
115
|
+
"exception": false,
|
|
116
|
+
"start_time": "2026-02-02T13:03:29.013232",
|
|
117
|
+
"status": "completed"
|
|
118
|
+
},
|
|
119
|
+
"tags": []
|
|
120
|
+
},
|
|
121
|
+
"outputs": [],
|
|
122
|
+
"source": [
|
|
123
|
+
"# === CONFIGURATION ===\n",
|
|
124
|
+
"# Option 1: Set the exact path from notebook 01 output\n",
|
|
125
|
+
"# FINDINGS_PATH = \"../experiments/findings/customer_retention_retail_abc123_findings.yaml\"\n",
|
|
126
|
+
"\n",
|
|
127
|
+
"# Option 2: Auto-discover the most recent findings file\n",
|
|
128
|
+
"from pathlib import Path\n",
|
|
129
|
+
"\n",
|
|
130
|
+
"# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
|
|
131
|
+
"\n",
|
|
132
|
+
"findings_files = [f for f in FINDINGS_DIR.glob(\"*_findings.yaml\") if \"multi_dataset\" not in f.name]\n",
|
|
133
|
+
"if not findings_files:\n",
|
|
134
|
+
" raise FileNotFoundError(f\"No findings files found in {FINDINGS_DIR}. Run notebook 01 first.\")\n",
|
|
135
|
+
"\n",
|
|
136
|
+
"# Prefer aggregated findings (from 01d) over event-level findings\n",
|
|
137
|
+
"# Pattern: *_aggregated* in filename indicates aggregated data\n",
|
|
138
|
+
"aggregated_files = [f for f in findings_files if \"_aggregated\" in f.name]\n",
|
|
139
|
+
"non_aggregated_files = [f for f in findings_files if \"_aggregated\" not in f.name]\n",
|
|
140
|
+
"\n",
|
|
141
|
+
"if aggregated_files:\n",
|
|
142
|
+
" # Use most recent aggregated file\n",
|
|
143
|
+
" aggregated_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)\n",
|
|
144
|
+
" FINDINGS_PATH = str(aggregated_files[0])\n",
|
|
145
|
+
" print(f\"Found {len(aggregated_files)} aggregated findings file(s)\")\n",
|
|
146
|
+
" print(f\"Using: {FINDINGS_PATH}\")\n",
|
|
147
|
+
" if non_aggregated_files:\n",
|
|
148
|
+
" print(f\" (Skipping {len(non_aggregated_files)} event-level findings)\")\n",
|
|
149
|
+
"else:\n",
|
|
150
|
+
" # Fall back to most recent non-aggregated file\n",
|
|
151
|
+
" non_aggregated_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)\n",
|
|
152
|
+
" FINDINGS_PATH = str(non_aggregated_files[0])\n",
|
|
153
|
+
" print(f\"Found {len(findings_files)} findings file(s)\")\n",
|
|
154
|
+
" print(f\"Using: {FINDINGS_PATH}\")\n",
|
|
155
|
+
"\n",
|
|
156
|
+
"RECOMMENDATIONS_PATH = FINDINGS_PATH.replace(\"_findings.yaml\", \"_recommendations.yaml\")\n",
|
|
157
|
+
"\n",
|
|
158
|
+
"findings = ExplorationFindings.load(FINDINGS_PATH)\n",
|
|
159
|
+
"\n",
|
|
160
|
+
"# Load data - handle aggregated vs standard paths\n",
|
|
161
|
+
"from customer_retention.stages.temporal import load_data_with_snapshot_preference, TEMPORAL_METADATA_COLS\n",
|
|
162
|
+
"\n",
|
|
163
|
+
"# For aggregated data, load directly from the parquet source\n",
|
|
164
|
+
"if \"_aggregated\" in FINDINGS_PATH and findings.source_path.endswith('.parquet'):\n",
|
|
165
|
+
" source_path = Path(findings.source_path)\n",
|
|
166
|
+
" # Handle relative path from notebook directory\n",
|
|
167
|
+
" if not source_path.is_absolute():\n",
|
|
168
|
+
" # The source_path in findings is relative to project root\n",
|
|
169
|
+
" if str(source_path).startswith(\"experiments\"):\n",
|
|
170
|
+
" source_path = Path(\"..\") / source_path\n",
|
|
171
|
+
" else:\n",
|
|
172
|
+
" source_path = FINDINGS_DIR / source_path.name\n",
|
|
173
|
+
" df = pd.read_parquet(source_path)\n",
|
|
174
|
+
" data_source = f\"aggregated:{source_path.name}\"\n",
|
|
175
|
+
"else:\n",
|
|
176
|
+
" # Standard loading for event-level or entity-level data\n",
|
|
177
|
+
" df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
|
|
178
|
+
"\n",
|
|
179
|
+
"charts = ChartBuilder()\n",
|
|
180
|
+
"\n",
|
|
181
|
+
"if Path(RECOMMENDATIONS_PATH).exists():\n",
|
|
182
|
+
" with open(RECOMMENDATIONS_PATH, \"r\") as f:\n",
|
|
183
|
+
" registry = RecommendationRegistry.from_dict(yaml.safe_load(f))\n",
|
|
184
|
+
" print(f\"Loaded existing recommendations: {len(registry.all_recommendations)} total\")\n",
|
|
185
|
+
"else:\n",
|
|
186
|
+
" registry = RecommendationRegistry()\n",
|
|
187
|
+
" print(\"Initialized new recommendation registry\")\n",
|
|
188
|
+
"\n",
|
|
189
|
+
"# Ensure all layers are initialized (even if loaded from file)\n",
|
|
190
|
+
"if not registry.bronze:\n",
|
|
191
|
+
" registry.init_bronze(findings.source_path)\n",
|
|
192
|
+
"if not registry.silver:\n",
|
|
193
|
+
" registry.init_silver(findings.entity_column or \"entity_id\")\n",
|
|
194
|
+
"if not registry.gold:\n",
|
|
195
|
+
" registry.init_gold(findings.target_column or \"target\")\n",
|
|
196
|
+
" print(\" Initialized gold layer for feature engineering recommendations\")\n",
|
|
197
|
+
"\n",
|
|
198
|
+
"print(f\"\\nLoaded {len(df):,} rows from: {data_source}\")"
|
|
199
|
+
]
|
|
200
|
+
},
|
|
201
|
+
{
|
|
202
|
+
"cell_type": "markdown",
|
|
203
|
+
"id": "1f934e52",
|
|
204
|
+
"metadata": {
|
|
205
|
+
"papermill": {
|
|
206
|
+
"duration": 0.002793,
|
|
207
|
+
"end_time": "2026-02-02T13:03:29.493261",
|
|
208
|
+
"exception": false,
|
|
209
|
+
"start_time": "2026-02-02T13:03:29.490468",
|
|
210
|
+
"status": "completed"
|
|
211
|
+
},
|
|
212
|
+
"tags": []
|
|
213
|
+
},
|
|
214
|
+
"source": [
|
|
215
|
+
"## 6.2 Automated Feature Recommendations"
|
|
216
|
+
]
|
|
217
|
+
},
|
|
218
|
+
{
|
|
219
|
+
"cell_type": "code",
|
|
220
|
+
"execution_count": null,
|
|
221
|
+
"id": "48774e06",
|
|
222
|
+
"metadata": {
|
|
223
|
+
"execution": {
|
|
224
|
+
"iopub.execute_input": "2026-02-02T13:03:29.499317Z",
|
|
225
|
+
"iopub.status.busy": "2026-02-02T13:03:29.499150Z",
|
|
226
|
+
"iopub.status.idle": "2026-02-02T13:03:29.504187Z",
|
|
227
|
+
"shell.execute_reply": "2026-02-02T13:03:29.503639Z"
|
|
228
|
+
},
|
|
229
|
+
"papermill": {
|
|
230
|
+
"duration": 0.008632,
|
|
231
|
+
"end_time": "2026-02-02T13:03:29.504689",
|
|
232
|
+
"exception": false,
|
|
233
|
+
"start_time": "2026-02-02T13:03:29.496057",
|
|
234
|
+
"status": "completed"
|
|
235
|
+
},
|
|
236
|
+
"tags": []
|
|
237
|
+
},
|
|
238
|
+
"outputs": [],
|
|
239
|
+
"source": [
|
|
240
|
+
"recommender = RecommendationEngine()\n",
|
|
241
|
+
"feature_recs = recommender.recommend_features(findings)\n",
|
|
242
|
+
"\n",
|
|
243
|
+
"print(f\"Found {len(feature_recs)} feature engineering opportunities:\\n\")\n",
|
|
244
|
+
"\n",
|
|
245
|
+
"for rec in feature_recs:\n",
|
|
246
|
+
" print(f\"{rec.feature_name}\")\n",
|
|
247
|
+
" print(f\" Source: {rec.source_column}\")\n",
|
|
248
|
+
" print(f\" Type: {rec.feature_type}\")\n",
|
|
249
|
+
" print(f\" Priority: {rec.priority}\")\n",
|
|
250
|
+
" print(f\" Description: {rec.description}\")\n",
|
|
251
|
+
" print()"
|
|
252
|
+
]
|
|
253
|
+
},
|
|
254
|
+
{
|
|
255
|
+
"cell_type": "markdown",
|
|
256
|
+
"id": "414d93c2",
|
|
257
|
+
"metadata": {
|
|
258
|
+
"papermill": {
|
|
259
|
+
"duration": 0.002979,
|
|
260
|
+
"end_time": "2026-02-02T13:03:29.510845",
|
|
261
|
+
"exception": false,
|
|
262
|
+
"start_time": "2026-02-02T13:03:29.507866",
|
|
263
|
+
"status": "completed"
|
|
264
|
+
},
|
|
265
|
+
"tags": []
|
|
266
|
+
},
|
|
267
|
+
"source": [
|
|
268
|
+
"## 6.3 Feature Capacity Analysis\n",
|
|
269
|
+
"\n",
|
|
270
|
+
"**📖 Understanding Feature-to-Data Ratios**\n",
|
|
271
|
+
"\n",
|
|
272
|
+
"Before creating new features, it's critical to understand how many features your data can reliably support. This analysis uses the **Events Per Variable (EPV)** principle:\n",
|
|
273
|
+
"\n",
|
|
274
|
+
"| EPV Level | Risk Level | Recommendations |\n",
|
|
275
|
+
"|-----------|------------|-----------------|\n",
|
|
276
|
+
"| **EPV ≥ 20** | Low risk | Stable coefficients, reliable inference |\n",
|
|
277
|
+
"| **EPV = 10-20** | Moderate | Standard practice, consider regularization |\n",
|
|
278
|
+
"| **EPV = 5-10** | Elevated | Strong regularization required (L1/Lasso) |\n",
|
|
279
|
+
"| **EPV < 5** | High risk | Reduce features or collect more data |\n",
|
|
280
|
+
"\n",
|
|
281
|
+
"**Key Assumptions:**\n",
|
|
282
|
+
"1. **Minority class drives capacity**: For classification, the smaller class limits feature count\n",
|
|
283
|
+
"2. **Correlated features are redundant**: Highly correlated features (r > 0.8) count as ~1 effective feature\n",
|
|
284
|
+
"3. **Model type matters**: Tree models are more flexible than linear models\n",
|
|
285
|
+
"4. **Regularization helps**: L1/L2 penalties allow more features with less data\n",
|
|
286
|
+
"\n",
|
|
287
|
+
"**📊 What This Analysis Provides:**\n",
|
|
288
|
+
"- Recommended feature counts (conservative/moderate/aggressive)\n",
|
|
289
|
+
"- Effective feature count after removing redundancy\n",
|
|
290
|
+
"- Model complexity guidance (linear vs tree-based)\n",
|
|
291
|
+
"- Segment-specific capacity for multi-model strategies"
|
|
292
|
+
]
|
|
293
|
+
},
|
|
294
|
+
{
|
|
295
|
+
"cell_type": "code",
|
|
296
|
+
"execution_count": null,
|
|
297
|
+
"id": "fceefe09",
|
|
298
|
+
"metadata": {
|
|
299
|
+
"execution": {
|
|
300
|
+
"iopub.execute_input": "2026-02-02T13:03:29.516821Z",
|
|
301
|
+
"iopub.status.busy": "2026-02-02T13:03:29.516704Z",
|
|
302
|
+
"iopub.status.idle": "2026-02-02T13:03:29.553252Z",
|
|
303
|
+
"shell.execute_reply": "2026-02-02T13:03:29.548370Z"
|
|
304
|
+
},
|
|
305
|
+
"papermill": {
|
|
306
|
+
"duration": 0.040696,
|
|
307
|
+
"end_time": "2026-02-02T13:03:29.554085",
|
|
308
|
+
"exception": false,
|
|
309
|
+
"start_time": "2026-02-02T13:03:29.513389",
|
|
310
|
+
"status": "completed"
|
|
311
|
+
},
|
|
312
|
+
"tags": []
|
|
313
|
+
},
|
|
314
|
+
"outputs": [],
|
|
315
|
+
"source": [
|
|
316
|
+
"# Feature Capacity Analysis\n",
|
|
317
|
+
"capacity_analyzer = FeatureCapacityAnalyzer()\n",
|
|
318
|
+
"\n",
|
|
319
|
+
"# Get all potential feature columns (excluding target and identifiers)\n",
|
|
320
|
+
"feature_cols = [\n",
|
|
321
|
+
" name for name, col in findings.columns.items()\n",
|
|
322
|
+
" if col.inferred_type in [\n",
|
|
323
|
+
" ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE,\n",
|
|
324
|
+
" ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL,\n",
|
|
325
|
+
" ColumnType.BINARY\n",
|
|
326
|
+
" ] and name != findings.target_column\n",
|
|
327
|
+
" and name not in TEMPORAL_METADATA_COLS\n",
|
|
328
|
+
"]\n",
|
|
329
|
+
"\n",
|
|
330
|
+
"print(\"=\" * 80)\n",
|
|
331
|
+
"print(\"FEATURE CAPACITY ANALYSIS\")\n",
|
|
332
|
+
"print(\"=\" * 80)\n",
|
|
333
|
+
"\n",
|
|
334
|
+
"if findings.target_column:\n",
|
|
335
|
+
" # Analyze capacity with current features\n",
|
|
336
|
+
" numeric_features = [\n",
|
|
337
|
+
" name for name, col in findings.columns.items()\n",
|
|
338
|
+
" if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
|
|
339
|
+
" and name != findings.target_column\n",
|
|
340
|
+
" ]\n",
|
|
341
|
+
" \n",
|
|
342
|
+
" capacity_result = capacity_analyzer.analyze(\n",
|
|
343
|
+
" df,\n",
|
|
344
|
+
" feature_cols=numeric_features,\n",
|
|
345
|
+
" target_col=findings.target_column,\n",
|
|
346
|
+
" )\n",
|
|
347
|
+
" \n",
|
|
348
|
+
" print(f\"\\n📊 DATA SUMMARY:\")\n",
|
|
349
|
+
" print(f\" Total samples: {capacity_result.total_samples:,}\")\n",
|
|
350
|
+
" print(f\" Minority class samples: {capacity_result.minority_class_samples:,}\")\n",
|
|
351
|
+
" print(f\" Minority class rate: {capacity_result.minority_class_samples/capacity_result.total_samples:.1%}\")\n",
|
|
352
|
+
" print(f\" Current numeric features: {capacity_result.total_features}\")\n",
|
|
353
|
+
" \n",
|
|
354
|
+
" print(f\"\\n📈 FEATURE CAPACITY METRICS:\")\n",
|
|
355
|
+
" print(f\" Events Per Variable (EPV): {capacity_result.events_per_variable:.1f}\")\n",
|
|
356
|
+
" print(f\" Samples Per Feature: {capacity_result.samples_per_feature:.1f}\")\n",
|
|
357
|
+
" print(f\" Capacity Status: {capacity_result.capacity_status.upper()}\")\n",
|
|
358
|
+
" \n",
|
|
359
|
+
" # Capacity status visualization\n",
|
|
360
|
+
" status_colors = {\"adequate\": \"#2ecc71\", \"limited\": \"#f39c12\", \"inadequate\": \"#e74c3c\"}\n",
|
|
361
|
+
" status_color = status_colors.get(capacity_result.capacity_status, \"#95a5a6\")\n",
|
|
362
|
+
" \n",
|
|
363
|
+
" print(f\"\\n🎯 RECOMMENDED FEATURE COUNTS:\")\n",
|
|
364
|
+
" print(f\" Conservative (EPV=20): {capacity_result.recommended_features_conservative} features\")\n",
|
|
365
|
+
" print(f\" Moderate (EPV=10): {capacity_result.recommended_features_moderate} features\")\n",
|
|
366
|
+
" print(f\" Aggressive (EPV=5): {capacity_result.recommended_features_aggressive} features\")\n",
|
|
367
|
+
" \n",
|
|
368
|
+
" # Effective features analysis\n",
|
|
369
|
+
" if capacity_result.effective_features_result:\n",
|
|
370
|
+
" eff = capacity_result.effective_features_result\n",
|
|
371
|
+
" print(f\"\\n🔍 EFFECTIVE FEATURES (accounting for correlation):\")\n",
|
|
372
|
+
" print(f\" Total features analyzed: {eff.total_count}\")\n",
|
|
373
|
+
" print(f\" Effective independent features: {eff.effective_count:.1f}\")\n",
|
|
374
|
+
" print(f\" Redundant features identified: {len(eff.redundant_features)}\")\n",
|
|
375
|
+
" \n",
|
|
376
|
+
" if eff.redundant_features:\n",
|
|
377
|
+
" print(f\"\\n ⚠️ Redundant features (highly correlated):\")\n",
|
|
378
|
+
" for feat in eff.redundant_features[:5]:\n",
|
|
379
|
+
" print(f\" • {feat}\")\n",
|
|
380
|
+
" \n",
|
|
381
|
+
" if eff.feature_clusters:\n",
|
|
382
|
+
" print(f\"\\n 📦 Correlated feature clusters ({len(eff.feature_clusters)}):\")\n",
|
|
383
|
+
" for i, cluster in enumerate(eff.feature_clusters[:3]):\n",
|
|
384
|
+
" print(f\" Cluster {i+1}: {', '.join(cluster[:4])}\")\n",
|
|
385
|
+
" if len(cluster) > 4:\n",
|
|
386
|
+
" print(f\" ... and {len(cluster)-4} more\")\n",
|
|
387
|
+
" \n",
|
|
388
|
+
" # Persist feature capacity to registry\n",
|
|
389
|
+
" registry.add_bronze_feature_capacity(\n",
|
|
390
|
+
" epv=capacity_result.events_per_variable,\n",
|
|
391
|
+
" capacity_status=capacity_result.capacity_status,\n",
|
|
392
|
+
" recommended_features=capacity_result.recommended_features_moderate,\n",
|
|
393
|
+
" current_features=capacity_result.total_features,\n",
|
|
394
|
+
" rationale=f\"EPV={capacity_result.events_per_variable:.1f}, status={capacity_result.capacity_status}\",\n",
|
|
395
|
+
" source_notebook=\"06_feature_opportunities\"\n",
|
|
396
|
+
" )\n",
|
|
397
|
+
" print(f\"\\n✅ Persisted feature capacity recommendation to registry\")\n",
|
|
398
|
+
" \n",
|
|
399
|
+
" # Store capacity info in findings\n",
|
|
400
|
+
" findings.metadata[\"feature_capacity\"] = capacity_result.to_dict()\n",
|
|
401
|
+
"else:\n",
|
|
402
|
+
" print(\"\\n⚠️ No target column detected. Capacity analysis requires a target variable.\")"
|
|
403
|
+
]
|
|
404
|
+
},
|
|
405
|
+
{
|
|
406
|
+
"cell_type": "markdown",
|
|
407
|
+
"id": "5b522958",
|
|
408
|
+
"metadata": {
|
|
409
|
+
"papermill": {
|
|
410
|
+
"duration": 0.00341,
|
|
411
|
+
"end_time": "2026-02-02T13:03:29.561633",
|
|
412
|
+
"exception": false,
|
|
413
|
+
"start_time": "2026-02-02T13:03:29.558223",
|
|
414
|
+
"status": "completed"
|
|
415
|
+
},
|
|
416
|
+
"tags": []
|
|
417
|
+
},
|
|
418
|
+
"source": [
|
|
419
|
+
"### 6.3.1 Model Complexity Guidance\n",
|
|
420
|
+
"\n",
|
|
421
|
+
"Based on your data capacity, here's guidance on model complexity and feature limits."
|
|
422
|
+
]
|
|
423
|
+
},
|
|
424
|
+
{
|
|
425
|
+
"cell_type": "code",
|
|
426
|
+
"execution_count": null,
|
|
427
|
+
"id": "a744f2a2",
|
|
428
|
+
"metadata": {
|
|
429
|
+
"execution": {
|
|
430
|
+
"iopub.execute_input": "2026-02-02T13:03:29.568292Z",
|
|
431
|
+
"iopub.status.busy": "2026-02-02T13:03:29.568182Z",
|
|
432
|
+
"iopub.status.idle": "2026-02-02T13:03:29.594267Z",
|
|
433
|
+
"shell.execute_reply": "2026-02-02T13:03:29.593686Z"
|
|
434
|
+
},
|
|
435
|
+
"papermill": {
|
|
436
|
+
"duration": 0.030556,
|
|
437
|
+
"end_time": "2026-02-02T13:03:29.595093",
|
|
438
|
+
"exception": false,
|
|
439
|
+
"start_time": "2026-02-02T13:03:29.564537",
|
|
440
|
+
"status": "completed"
|
|
441
|
+
},
|
|
442
|
+
"tags": []
|
|
443
|
+
},
|
|
444
|
+
"outputs": [],
|
|
445
|
+
"source": [
|
|
446
|
+
"# Model Complexity Guidance\n",
|
|
447
|
+
"if findings.target_column and 'capacity_result' in dir():\n",
|
|
448
|
+
" guidance = capacity_result.complexity_guidance\n",
|
|
449
|
+
" \n",
|
|
450
|
+
" print(\"=\" * 70)\n",
|
|
451
|
+
" print(\"MODEL COMPLEXITY GUIDANCE\")\n",
|
|
452
|
+
" print(\"=\" * 70)\n",
|
|
453
|
+
" \n",
|
|
454
|
+
" # Create visualization of feature limits by model type\n",
|
|
455
|
+
" model_types = [\"Linear\\n(no regularization)\", \"Regularized\\n(L1/L2)\", \"Tree-based\\n(RF/XGBoost)\"]\n",
|
|
456
|
+
" max_features = [guidance.max_features_linear, guidance.max_features_regularized, guidance.max_features_tree]\n",
|
|
457
|
+
" current_features = capacity_result.total_features\n",
|
|
458
|
+
" \n",
|
|
459
|
+
" colors = ['#e74c3c' if m < current_features else '#2ecc71' for m in max_features]\n",
|
|
460
|
+
" \n",
|
|
461
|
+
" fig = go.Figure()\n",
|
|
462
|
+
" \n",
|
|
463
|
+
" fig.add_trace(go.Bar(\n",
|
|
464
|
+
" x=model_types,\n",
|
|
465
|
+
" y=max_features,\n",
|
|
466
|
+
" marker_color=colors,\n",
|
|
467
|
+
" text=[f\"{m}\" for m in max_features],\n",
|
|
468
|
+
" textposition='outside',\n",
|
|
469
|
+
" name='Max Features'\n",
|
|
470
|
+
" ))\n",
|
|
471
|
+
" \n",
|
|
472
|
+
" # Add horizontal line for current feature count\n",
|
|
473
|
+
" fig.add_hline(\n",
|
|
474
|
+
" y=current_features,\n",
|
|
475
|
+
" line_dash=\"dash\",\n",
|
|
476
|
+
" line_color=\"#3498db\",\n",
|
|
477
|
+
" annotation_text=f\"Current: {current_features}\",\n",
|
|
478
|
+
" annotation_position=\"right\"\n",
|
|
479
|
+
" )\n",
|
|
480
|
+
" \n",
|
|
481
|
+
" # Calculate y-axis range to fit labels\n",
|
|
482
|
+
" max_val = max(max_features)\n",
|
|
483
|
+
" fig.update_layout(\n",
|
|
484
|
+
" title=\"Maximum Recommended Features by Model Type\",\n",
|
|
485
|
+
" xaxis_title=\"Model Type\",\n",
|
|
486
|
+
" yaxis_title=\"Max Features\",\n",
|
|
487
|
+
" yaxis_range=[0, max_val * 1.15], # Add 15% headroom for labels\n",
|
|
488
|
+
" template='plotly_white',\n",
|
|
489
|
+
" height=400,\n",
|
|
490
|
+
" showlegend=False,\n",
|
|
491
|
+
" )\n",
|
|
492
|
+
" \n",
|
|
493
|
+
" display_figure(fig)\n",
|
|
494
|
+
" \n",
|
|
495
|
+
" print(f\"\\n🎯 RECOMMENDED MODEL TYPE: {guidance.recommended_model_type.replace('_', ' ').title()}\")\n",
|
|
496
|
+
" \n",
|
|
497
|
+
" print(\"\\n📋 MODEL-SPECIFIC RECOMMENDATIONS:\")\n",
|
|
498
|
+
" for rec in guidance.model_recommendations:\n",
|
|
499
|
+
" print(f\" • {rec}\")\n",
|
|
500
|
+
" \n",
|
|
501
|
+
" print(\"\\n💡 GENERAL GUIDANCE:\")\n",
|
|
502
|
+
" for rec in guidance.recommendations:\n",
|
|
503
|
+
" print(f\" {rec}\")\n",
|
|
504
|
+
" \n",
|
|
505
|
+
" # Summary table\n",
|
|
506
|
+
" print(\"\\n\" + \"-\" * 70)\n",
|
|
507
|
+
" print(\"FEATURE BUDGET SUMMARY:\")\n",
|
|
508
|
+
" print(\"-\" * 70)\n",
|
|
509
|
+
" summary_data = {\n",
|
|
510
|
+
" \"Model Type\": [\"Linear (no regularization)\", \"Regularized (L1/L2)\", \"Tree-based\"],\n",
|
|
511
|
+
" \"Max Features\": [guidance.max_features_linear, guidance.max_features_regularized, guidance.max_features_tree],\n",
|
|
512
|
+
" \"Current\": [current_features] * 3,\n",
|
|
513
|
+
" \"Status\": [\n",
|
|
514
|
+
" \"✅ OK\" if guidance.max_features_linear >= current_features else \"⚠️ Reduce\",\n",
|
|
515
|
+
" \"✅ OK\" if guidance.max_features_regularized >= current_features else \"⚠️ Reduce\", \n",
|
|
516
|
+
" \"✅ OK\" if guidance.max_features_tree >= current_features else \"⚠️ Reduce\"\n",
|
|
517
|
+
" ]\n",
|
|
518
|
+
" }\n",
|
|
519
|
+
" display(pd.DataFrame(summary_data))\n",
|
|
520
|
+
" \n",
|
|
521
|
+
" # Persist model type recommendation to registry\n",
|
|
522
|
+
" registry.add_bronze_model_type(\n",
|
|
523
|
+
" model_type=guidance.recommended_model_type,\n",
|
|
524
|
+
" max_features_linear=guidance.max_features_linear,\n",
|
|
525
|
+
" max_features_regularized=guidance.max_features_regularized,\n",
|
|
526
|
+
" max_features_tree=guidance.max_features_tree,\n",
|
|
527
|
+
" rationale=f\"Recommended: {guidance.recommended_model_type}\",\n",
|
|
528
|
+
" source_notebook=\"06_feature_opportunities\"\n",
|
|
529
|
+
" )\n",
|
|
530
|
+
" print(f\"\\n✅ Persisted model type recommendation to registry: {guidance.recommended_model_type}\")"
|
|
531
|
+
]
|
|
532
|
+
},
|
|
533
|
+
{
|
|
534
|
+
"cell_type": "markdown",
|
|
535
|
+
"id": "60d49437",
|
|
536
|
+
"metadata": {
|
|
537
|
+
"papermill": {
|
|
538
|
+
"duration": 0.004153,
|
|
539
|
+
"end_time": "2026-02-02T13:03:29.604467",
|
|
540
|
+
"exception": false,
|
|
541
|
+
"start_time": "2026-02-02T13:03:29.600314",
|
|
542
|
+
"status": "completed"
|
|
543
|
+
},
|
|
544
|
+
"tags": []
|
|
545
|
+
},
|
|
546
|
+
"source": [
|
|
547
|
+
"### 6.3.2 Segment-Specific Capacity (for Multi-Model Strategy)\n",
|
|
548
|
+
"\n",
|
|
549
|
+
"When considering **separate models per customer segment**, each segment must have sufficient data to support the feature set. This analysis shows whether segmented modeling is viable.\n",
|
|
550
|
+
"\n",
|
|
551
|
+
"**📖 Single Model vs Segment Models:**\n",
|
|
552
|
+
"\n",
|
|
553
|
+
"| Approach | When to Use | Pros | Cons |\n",
|
|
554
|
+
"|----------|------------|------|------|\n",
|
|
555
|
+
"| **Single Model** | Small data, uniform segments | More data per model, simpler | May miss segment-specific patterns |\n",
|
|
556
|
+
"| **Segment Models** | Large data, distinct segments | Tailored patterns | Need sufficient data per segment |\n",
|
|
557
|
+
"| **Hybrid** | Mixed segment sizes | Best of both | More complex to maintain |"
|
|
558
|
+
]
|
|
559
|
+
},
|
|
560
|
+
{
|
|
561
|
+
"cell_type": "code",
|
|
562
|
+
"execution_count": null,
|
|
563
|
+
"id": "1e953415",
|
|
564
|
+
"metadata": {
|
|
565
|
+
"execution": {
|
|
566
|
+
"iopub.execute_input": "2026-02-02T13:03:29.613878Z",
|
|
567
|
+
"iopub.status.busy": "2026-02-02T13:03:29.613754Z",
|
|
568
|
+
"iopub.status.idle": "2026-02-02T13:03:29.670977Z",
|
|
569
|
+
"shell.execute_reply": "2026-02-02T13:03:29.670274Z"
|
|
570
|
+
},
|
|
571
|
+
"papermill": {
|
|
572
|
+
"duration": 0.062878,
|
|
573
|
+
"end_time": "2026-02-02T13:03:29.671675",
|
|
574
|
+
"exception": false,
|
|
575
|
+
"start_time": "2026-02-02T13:03:29.608797",
|
|
576
|
+
"status": "completed"
|
|
577
|
+
},
|
|
578
|
+
"tags": []
|
|
579
|
+
},
|
|
580
|
+
"outputs": [],
|
|
581
|
+
"source": [
|
|
582
|
+
"# Segment Capacity Analysis\n",
|
|
583
|
+
"categorical_cols = [\n",
|
|
584
|
+
" name for name, col in findings.columns.items()\n",
|
|
585
|
+
" if col.inferred_type in [ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL]\n",
|
|
586
|
+
" and name not in TEMPORAL_METADATA_COLS\n",
|
|
587
|
+
"]\n",
|
|
588
|
+
"\n",
|
|
589
|
+
"print(\"=\" * 70)\n",
|
|
590
|
+
"print(\"SEGMENT CAPACITY ANALYSIS\")\n",
|
|
591
|
+
"print(\"=\" * 70)\n",
|
|
592
|
+
"\n",
|
|
593
|
+
"if findings.target_column and categorical_cols and 'numeric_features' in dir():\n",
|
|
594
|
+
" # Analyze the first categorical column as potential segment\n",
|
|
595
|
+
" segment_col = categorical_cols[0]\n",
|
|
596
|
+
" \n",
|
|
597
|
+
" print(f\"\\n📊 Analyzing segments by: {segment_col}\")\n",
|
|
598
|
+
" print(f\" Features to evaluate: {len(numeric_features)}\")\n",
|
|
599
|
+
" \n",
|
|
600
|
+
" segment_result = capacity_analyzer.analyze_segment_capacity(\n",
|
|
601
|
+
" df,\n",
|
|
602
|
+
" feature_cols=numeric_features,\n",
|
|
603
|
+
" target_col=findings.target_column,\n",
|
|
604
|
+
" segment_col=segment_col,\n",
|
|
605
|
+
" )\n",
|
|
606
|
+
" \n",
|
|
607
|
+
" print(f\"\\n🎯 RECOMMENDED STRATEGY: {segment_result.recommended_strategy.replace('_', ' ').title()}\")\n",
|
|
608
|
+
" print(f\" Reason: {segment_result.strategy_reason}\")\n",
|
|
609
|
+
" \n",
|
|
610
|
+
" # Segment details table\n",
|
|
611
|
+
" segment_data = []\n",
|
|
612
|
+
" for seg_name, cap in segment_result.segment_capacities.items():\n",
|
|
613
|
+
" segment_data.append({\n",
|
|
614
|
+
" \"Segment\": seg_name,\n",
|
|
615
|
+
" \"Samples\": cap.total_samples,\n",
|
|
616
|
+
" \"Minority Events\": cap.minority_class_samples,\n",
|
|
617
|
+
" \"EPV\": f\"{cap.events_per_variable:.1f}\",\n",
|
|
618
|
+
" \"Max Features (EPV=10)\": cap.recommended_features_moderate,\n",
|
|
619
|
+
" \"Status\": cap.capacity_status.title()\n",
|
|
620
|
+
" })\n",
|
|
621
|
+
" \n",
|
|
622
|
+
" segment_df = pd.DataFrame(segment_data)\n",
|
|
623
|
+
" segment_df = segment_df.sort_values(\"Samples\", ascending=False)\n",
|
|
624
|
+
" display(segment_df)\n",
|
|
625
|
+
" \n",
|
|
626
|
+
" # Visualization\n",
|
|
627
|
+
" fig = go.Figure()\n",
|
|
628
|
+
" \n",
|
|
629
|
+
" max_events = 0\n",
|
|
630
|
+
" for seg_name, cap in segment_result.segment_capacities.items():\n",
|
|
631
|
+
" color = \"#2ecc71\" if cap.capacity_status == \"adequate\" else \"#f39c12\" if cap.capacity_status == \"limited\" else \"#e74c3c\"\n",
|
|
632
|
+
" fig.add_trace(go.Bar(\n",
|
|
633
|
+
" name=seg_name,\n",
|
|
634
|
+
" x=[seg_name],\n",
|
|
635
|
+
" y=[cap.minority_class_samples],\n",
|
|
636
|
+
" marker_color=color,\n",
|
|
637
|
+
" text=[f\"EPV={cap.events_per_variable:.1f}\"],\n",
|
|
638
|
+
" textposition='outside'\n",
|
|
639
|
+
" ))\n",
|
|
640
|
+
" max_events = max(max_events, cap.minority_class_samples)\n",
|
|
641
|
+
" \n",
|
|
642
|
+
" # Add threshold line\n",
|
|
643
|
+
" threshold_events = len(numeric_features) * 10 # EPV=10 threshold\n",
|
|
644
|
+
" fig.add_hline(\n",
|
|
645
|
+
" y=threshold_events,\n",
|
|
646
|
+
" line_dash=\"dash\",\n",
|
|
647
|
+
" line_color=\"#3498db\",\n",
|
|
648
|
+
" annotation_text=f\"Min events for {len(numeric_features)} features (EPV=10)\",\n",
|
|
649
|
+
" annotation_position=\"right\"\n",
|
|
650
|
+
" )\n",
|
|
651
|
+
" \n",
|
|
652
|
+
" # Calculate y-axis range to fit labels\n",
|
|
653
|
+
" y_max = max(max_events, threshold_events)\n",
|
|
654
|
+
" fig.update_layout(\n",
|
|
655
|
+
" title=f\"Minority Class Events by Segment ({segment_col})\",\n",
|
|
656
|
+
" xaxis_title=\"Segment\",\n",
|
|
657
|
+
" yaxis_title=\"Minority Class Events\",\n",
|
|
658
|
+
" yaxis_range=[0, y_max * 1.15], # Add 15% headroom for labels\n",
|
|
659
|
+
" template='plotly_white',\n",
|
|
660
|
+
" height=400,\n",
|
|
661
|
+
" showlegend=False,\n",
|
|
662
|
+
" )\n",
|
|
663
|
+
" display_figure(fig)\n",
|
|
664
|
+
" \n",
|
|
665
|
+
" print(\"\\n📋 SEGMENT RECOMMENDATIONS:\")\n",
|
|
666
|
+
" for rec in segment_result.recommendations:\n",
|
|
667
|
+
" print(f\" {rec}\")\n",
|
|
668
|
+
" \n",
|
|
669
|
+
" if segment_result.viable_segments:\n",
|
|
670
|
+
" print(f\"\\n ✅ Viable for separate models: {', '.join(segment_result.viable_segments)}\")\n",
|
|
671
|
+
" if segment_result.insufficient_segments:\n",
|
|
672
|
+
" print(f\" ⚠️ Insufficient data: {', '.join(segment_result.insufficient_segments)}\")\n",
|
|
673
|
+
" \n",
|
|
674
|
+
" # Store in findings\n",
|
|
675
|
+
" findings.metadata[\"segment_capacity\"] = segment_result.to_dict()\n",
|
|
676
|
+
"else:\n",
|
|
677
|
+
" print(\"\\n⚠️ No categorical columns available for segment analysis.\")\n",
|
|
678
|
+
" print(\" Segment capacity analysis requires at least one categorical column.\")"
|
|
679
|
+
]
|
|
680
|
+
},
|
|
681
|
+
{
|
|
682
|
+
"cell_type": "markdown",
|
|
683
|
+
"id": "1ddaaacb",
|
|
684
|
+
"metadata": {
|
|
685
|
+
"papermill": {
|
|
686
|
+
"duration": 0.006377,
|
|
687
|
+
"end_time": "2026-02-02T13:03:29.684180",
|
|
688
|
+
"exception": false,
|
|
689
|
+
"start_time": "2026-02-02T13:03:29.677803",
|
|
690
|
+
"status": "completed"
|
|
691
|
+
},
|
|
692
|
+
"tags": []
|
|
693
|
+
},
|
|
694
|
+
"source": [
|
|
695
|
+
"### 6.3.3 Feature Capacity Action Items\n",
|
|
696
|
+
"\n",
|
|
697
|
+
"Based on the analysis above, here are the key considerations for feature engineering:"
|
|
698
|
+
]
|
|
699
|
+
},
|
|
700
|
+
{
|
|
701
|
+
"cell_type": "code",
|
|
702
|
+
"execution_count": null,
|
|
703
|
+
"id": "0546a0e2",
|
|
704
|
+
"metadata": {
|
|
705
|
+
"execution": {
|
|
706
|
+
"iopub.execute_input": "2026-02-02T13:03:29.697042Z",
|
|
707
|
+
"iopub.status.busy": "2026-02-02T13:03:29.696909Z",
|
|
708
|
+
"iopub.status.idle": "2026-02-02T13:03:29.702192Z",
|
|
709
|
+
"shell.execute_reply": "2026-02-02T13:03:29.701745Z"
|
|
710
|
+
},
|
|
711
|
+
"papermill": {
|
|
712
|
+
"duration": 0.012305,
|
|
713
|
+
"end_time": "2026-02-02T13:03:29.702706",
|
|
714
|
+
"exception": false,
|
|
715
|
+
"start_time": "2026-02-02T13:03:29.690401",
|
|
716
|
+
"status": "completed"
|
|
717
|
+
},
|
|
718
|
+
"tags": []
|
|
719
|
+
},
|
|
720
|
+
"outputs": [],
|
|
721
|
+
"source": [
|
|
722
|
+
"# Feature Capacity Action Items Summary\n",
|
|
723
|
+
"if findings.target_column and 'capacity_result' in dir():\n",
|
|
724
|
+
" print(\"=\" * 70)\n",
|
|
725
|
+
" print(\"FEATURE CAPACITY ACTION ITEMS\")\n",
|
|
726
|
+
" print(\"=\" * 70)\n",
|
|
727
|
+
" \n",
|
|
728
|
+
" print(\"\\n📋 BASED ON YOUR DATA CAPACITY:\")\n",
|
|
729
|
+
" \n",
|
|
730
|
+
" # Action items based on capacity status\n",
|
|
731
|
+
" if capacity_result.capacity_status == \"adequate\":\n",
|
|
732
|
+
" print(\"\\n✅ ADEQUATE CAPACITY - You have room to add features\")\n",
|
|
733
|
+
" print(f\" • Current features: {capacity_result.total_features}\")\n",
|
|
734
|
+
" print(f\" • Can add up to: {capacity_result.recommended_features_moderate - capacity_result.total_features} more features (EPV=10)\")\n",
|
|
735
|
+
" print(f\" • Consider: Creating derived features from datetime and categorical columns\")\n",
|
|
736
|
+
" elif capacity_result.capacity_status == \"limited\":\n",
|
|
737
|
+
" print(\"\\n⚠️ LIMITED CAPACITY - Be selective with new features\")\n",
|
|
738
|
+
" print(f\" • Current features: {capacity_result.total_features}\")\n",
|
|
739
|
+
" print(f\" • Recommended max: {capacity_result.recommended_features_moderate} features (EPV=10)\")\n",
|
|
740
|
+
" print(f\" • Action: Remove {max(0, capacity_result.total_features - capacity_result.recommended_features_moderate)} redundant features before adding new ones\")\n",
|
|
741
|
+
" print(f\" • Consider: Using regularization (L1/Lasso) if keeping all features\")\n",
|
|
742
|
+
" else:\n",
|
|
743
|
+
" print(\"\\n🔴 INADEQUATE CAPACITY - Reduce features or get more data\")\n",
|
|
744
|
+
" print(f\" • Current features: {capacity_result.total_features}\")\n",
|
|
745
|
+
" print(f\" • Recommended max: {capacity_result.recommended_features_moderate} features (EPV=10)\")\n",
|
|
746
|
+
" print(f\" • CRITICAL: Reduce to {capacity_result.recommended_features_conservative} features for stable estimates\")\n",
|
|
747
|
+
" print(f\" • Options: (1) Feature selection, (2) PCA, (3) Collect more data\")\n",
|
|
748
|
+
" \n",
|
|
749
|
+
" # Redundancy recommendations\n",
|
|
750
|
+
" if capacity_result.effective_features_result and capacity_result.effective_features_result.redundant_features:\n",
|
|
751
|
+
" redundant = capacity_result.effective_features_result.redundant_features\n",
|
|
752
|
+
" print(f\"\\n🔄 REDUNDANT FEATURES TO CONSIDER REMOVING:\")\n",
|
|
753
|
+
" print(f\" These features are highly correlated with others and add little new information:\")\n",
|
|
754
|
+
" for feat in redundant[:5]:\n",
|
|
755
|
+
" print(f\" • {feat}\")\n",
|
|
756
|
+
" if len(redundant) > 5:\n",
|
|
757
|
+
" print(f\" ... and {len(redundant) - 5} more\")\n",
|
|
758
|
+
" \n",
|
|
759
|
+
" # New feature budget\n",
|
|
760
|
+
" print(\"\\n💰 FEATURE BUDGET FOR NEW FEATURES:\")\n",
|
|
761
|
+
" remaining_budget = capacity_result.recommended_features_moderate - capacity_result.total_features\n",
|
|
762
|
+
" if remaining_budget > 0:\n",
|
|
763
|
+
" print(f\" You can safely add {remaining_budget} new features\")\n",
|
|
764
|
+
" print(\" Prioritize:\")\n",
|
|
765
|
+
" print(\" • Recency features (days_since_last_activity)\")\n",
|
|
766
|
+
" print(\" • Tenure features (days_since_created)\")\n",
|
|
767
|
+
" print(\" • Engagement composites (email_engagement_score)\")\n",
|
|
768
|
+
" else:\n",
|
|
769
|
+
" print(f\" ⚠️ At or over capacity. Remove {-remaining_budget} features before adding new ones.\")\n",
|
|
770
|
+
" \n",
|
|
771
|
+
" # Model selection summary\n",
|
|
772
|
+
" print(\"\\n🎯 RECOMMENDED MODELING APPROACH:\")\n",
|
|
773
|
+
" if capacity_result.complexity_guidance:\n",
|
|
774
|
+
" print(f\" Model type: {capacity_result.complexity_guidance.recommended_model_type.replace('_', ' ').title()}\")\n",
|
|
775
|
+
" if \"regularized\" in capacity_result.complexity_guidance.recommended_model_type:\n",
|
|
776
|
+
" print(\" → Use Lasso (L1) for automatic feature selection\")\n",
|
|
777
|
+
" print(\" → Use Ridge (L2) if you want to keep all features\")\n",
|
|
778
|
+
" elif \"tree\" in capacity_result.complexity_guidance.recommended_model_type:\n",
|
|
779
|
+
" print(\" → Random Forest or XGBoost recommended\")\n",
|
|
780
|
+
" print(\" → Trees handle correlated features naturally\")\n",
|
|
781
|
+
" \n",
|
|
782
|
+
" print(\"\\n\" + \"=\" * 70)"
|
|
783
|
+
]
|
|
784
|
+
},
|
|
785
|
+
{
|
|
786
|
+
"cell_type": "markdown",
|
|
787
|
+
"id": "fa2dec27",
|
|
788
|
+
"metadata": {
|
|
789
|
+
"papermill": {
|
|
790
|
+
"duration": 0.006408,
|
|
791
|
+
"end_time": "2026-02-02T13:03:29.715066",
|
|
792
|
+
"exception": false,
|
|
793
|
+
"start_time": "2026-02-02T13:03:29.708658",
|
|
794
|
+
"status": "completed"
|
|
795
|
+
},
|
|
796
|
+
"tags": []
|
|
797
|
+
},
|
|
798
|
+
"source": [
|
|
799
|
+
"### 6.3.4 Feature Availability Issues\n",
|
|
800
|
+
"\n",
|
|
801
|
+
"Features with tracking changes (new systems, retired systems) require special handling before modeling."
|
|
802
|
+
]
|
|
803
|
+
},
|
|
804
|
+
{
|
|
805
|
+
"cell_type": "code",
|
|
806
|
+
"execution_count": null,
|
|
807
|
+
"id": "1dca7374",
|
|
808
|
+
"metadata": {
|
|
809
|
+
"execution": {
|
|
810
|
+
"iopub.execute_input": "2026-02-02T13:03:29.728363Z",
|
|
811
|
+
"iopub.status.busy": "2026-02-02T13:03:29.728251Z",
|
|
812
|
+
"iopub.status.idle": "2026-02-02T13:03:29.731861Z",
|
|
813
|
+
"shell.execute_reply": "2026-02-02T13:03:29.731393Z"
|
|
814
|
+
},
|
|
815
|
+
"papermill": {
|
|
816
|
+
"duration": 0.010902,
|
|
817
|
+
"end_time": "2026-02-02T13:03:29.732323",
|
|
818
|
+
"exception": false,
|
|
819
|
+
"start_time": "2026-02-02T13:03:29.721421",
|
|
820
|
+
"status": "completed"
|
|
821
|
+
},
|
|
822
|
+
"tags": []
|
|
823
|
+
},
|
|
824
|
+
"outputs": [],
|
|
825
|
+
"source": [
|
|
826
|
+
"# Feature Availability Analysis\n",
|
|
827
|
+
"from customer_retention.stages.features.feature_selector import FeatureSelector\n",
|
|
828
|
+
"\n",
|
|
829
|
+
"print(\"=\" * 70)\n",
|
|
830
|
+
"print(\"FEATURE AVAILABILITY ANALYSIS\")\n",
|
|
831
|
+
"print(\"=\" * 70)\n",
|
|
832
|
+
"\n",
|
|
833
|
+
"unavailable_features = []\n",
|
|
834
|
+
"if findings.has_availability_issues:\n",
|
|
835
|
+
" selector = FeatureSelector(target_column=findings.target_column)\n",
|
|
836
|
+
" availability_recs = selector.get_availability_recommendations(findings.feature_availability)\n",
|
|
837
|
+
" unavailable_features = [rec.column for rec in availability_recs]\n",
|
|
838
|
+
" \n",
|
|
839
|
+
" print(f\"\\n⚠️ {len(availability_recs)} feature(s) have tracking changes:\\n\")\n",
|
|
840
|
+
" \n",
|
|
841
|
+
" for rec in availability_recs:\n",
|
|
842
|
+
" print(f\"📌 {rec.column}\")\n",
|
|
843
|
+
" print(f\" Issue: {rec.issue_type} | Coverage: {rec.coverage_pct:.0f}%\")\n",
|
|
844
|
+
" print(f\" Available: {rec.first_valid_date} → {rec.last_valid_date}\")\n",
|
|
845
|
+
" print(f\"\\n Remediation options:\")\n",
|
|
846
|
+
" for opt in rec.options:\n",
|
|
847
|
+
" marker = \"→\" if opt.get(\"recommended\") else \" \"\n",
|
|
848
|
+
" print(f\" {marker} [{opt['type']}] {opt['description']}\")\n",
|
|
849
|
+
" print()\n",
|
|
850
|
+
" \n",
|
|
851
|
+
" print(\"-\" * 70)\n",
|
|
852
|
+
" print(\"RECOMMENDED ACTION: Remove unavailable features before modeling\")\n",
|
|
853
|
+
" print(\"-\" * 70)\n",
|
|
854
|
+
" print(f\"\\nFeatures to exclude: {', '.join(unavailable_features)}\")\n",
|
|
855
|
+
" print(\"\\nAlternative approaches (require additional implementation):\")\n",
|
|
856
|
+
" print(\" • segment_by_cohort: Train separate models for different time periods\")\n",
|
|
857
|
+
" print(\" • add_indicator: Create availability flags, impute missing values\")\n",
|
|
858
|
+
" print(\" • filter_window: Restrict training data to feature's available period\")\n",
|
|
859
|
+
" \n",
|
|
860
|
+
" findings.metadata[\"unavailable_features\"] = unavailable_features\n",
|
|
861
|
+
" findings.metadata[\"availability_action\"] = \"exclude\"\n",
|
|
862
|
+
"else:\n",
|
|
863
|
+
" print(\"\\n✅ All features have full temporal coverage - no availability issues.\")"
|
|
864
|
+
]
|
|
865
|
+
},
|
|
866
|
+
{
|
|
867
|
+
"cell_type": "markdown",
|
|
868
|
+
"id": "f8a17e0d",
|
|
869
|
+
"metadata": {
|
|
870
|
+
"papermill": {
|
|
871
|
+
"duration": 0.006457,
|
|
872
|
+
"end_time": "2026-02-02T13:03:29.745087",
|
|
873
|
+
"exception": false,
|
|
874
|
+
"start_time": "2026-02-02T13:03:29.738630",
|
|
875
|
+
"status": "completed"
|
|
876
|
+
},
|
|
877
|
+
"tags": []
|
|
878
|
+
},
|
|
879
|
+
"source": [
|
|
880
|
+
"## 6.4 Datetime Feature Opportunities"
|
|
881
|
+
]
|
|
882
|
+
},
|
|
883
|
+
{
|
|
884
|
+
"cell_type": "code",
|
|
885
|
+
"execution_count": null,
|
|
886
|
+
"id": "22750047",
|
|
887
|
+
"metadata": {
|
|
888
|
+
"execution": {
|
|
889
|
+
"iopub.execute_input": "2026-02-02T13:03:29.758503Z",
|
|
890
|
+
"iopub.status.busy": "2026-02-02T13:03:29.758383Z",
|
|
891
|
+
"iopub.status.idle": "2026-02-02T13:03:29.761339Z",
|
|
892
|
+
"shell.execute_reply": "2026-02-02T13:03:29.760856Z"
|
|
893
|
+
},
|
|
894
|
+
"papermill": {
|
|
895
|
+
"duration": 0.011094,
|
|
896
|
+
"end_time": "2026-02-02T13:03:29.762065",
|
|
897
|
+
"exception": false,
|
|
898
|
+
"start_time": "2026-02-02T13:03:29.750971",
|
|
899
|
+
"status": "completed"
|
|
900
|
+
},
|
|
901
|
+
"tags": []
|
|
902
|
+
},
|
|
903
|
+
"outputs": [],
|
|
904
|
+
"source": [
|
|
905
|
+
"datetime_cols = [\n",
|
|
906
|
+
" name for name, col in findings.columns.items()\n",
|
|
907
|
+
" if col.inferred_type == ColumnType.DATETIME\n",
|
|
908
|
+
"]\n",
|
|
909
|
+
"\n",
|
|
910
|
+
"if datetime_cols:\n",
|
|
911
|
+
" print(\"Datetime Feature Opportunities:\")\n",
|
|
912
|
+
" print(\"=\"*50)\n",
|
|
913
|
+
" for col in datetime_cols:\n",
|
|
914
|
+
" print(f\"\\n{col}:\")\n",
|
|
915
|
+
" print(f\" - {col}_year: Extract year\")\n",
|
|
916
|
+
" print(f\" - {col}_month: Extract month\")\n",
|
|
917
|
+
" print(f\" - {col}_day: Extract day of month\")\n",
|
|
918
|
+
" print(f\" - {col}_dayofweek: Extract day of week (0-6)\")\n",
|
|
919
|
+
" print(f\" - {col}_is_weekend: Is weekend flag\")\n",
|
|
920
|
+
" print(f\" - days_since_{col}: Days since date\")\n",
|
|
921
|
+
"else:\n",
|
|
922
|
+
" print(\"No datetime columns found.\")"
|
|
923
|
+
]
|
|
924
|
+
},
|
|
925
|
+
{
|
|
926
|
+
"cell_type": "markdown",
|
|
927
|
+
"id": "2bf1a888",
|
|
928
|
+
"metadata": {
|
|
929
|
+
"papermill": {
|
|
930
|
+
"duration": 0.006096,
|
|
931
|
+
"end_time": "2026-02-02T13:03:29.774279",
|
|
932
|
+
"exception": false,
|
|
933
|
+
"start_time": "2026-02-02T13:03:29.768183",
|
|
934
|
+
"status": "completed"
|
|
935
|
+
},
|
|
936
|
+
"tags": []
|
|
937
|
+
},
|
|
938
|
+
"source": [
|
|
939
|
+
"## 6.5 Business-Driven Derived Features\n",
|
|
940
|
+
"\n",
|
|
941
|
+
"These features are based on domain knowledge from the reference analysis (my_take Phase 1).\n",
|
|
942
|
+
"\n",
|
|
943
|
+
"**📖 Key Derived Features:**\n",
|
|
944
|
+
"- **Tenure Days**: Days from account creation to analysis date\n",
|
|
945
|
+
"- **Days Since Last Order**: Recency indicator (critical for churn)\n",
|
|
946
|
+
"- **Active Period Days**: Duration of customer activity\n",
|
|
947
|
+
"- **Email Engagement Score**: Composite of open rate and click rate\n",
|
|
948
|
+
"- **Click-to-Open Ratio**: Quality of email engagement\n",
|
|
949
|
+
"- **Service Adoption Score**: Sum of service flags (paperless, refill, doorstep)"
|
|
950
|
+
]
|
|
951
|
+
},
|
|
952
|
+
{
|
|
953
|
+
"cell_type": "code",
|
|
954
|
+
"execution_count": null,
|
|
955
|
+
"id": "b4c81f6d",
|
|
956
|
+
"metadata": {
|
|
957
|
+
"execution": {
|
|
958
|
+
"iopub.execute_input": "2026-02-02T13:03:29.787759Z",
|
|
959
|
+
"iopub.status.busy": "2026-02-02T13:03:29.787565Z",
|
|
960
|
+
"iopub.status.idle": "2026-02-02T13:03:29.797850Z",
|
|
961
|
+
"shell.execute_reply": "2026-02-02T13:03:29.797421Z"
|
|
962
|
+
},
|
|
963
|
+
"papermill": {
|
|
964
|
+
"duration": 0.017788,
|
|
965
|
+
"end_time": "2026-02-02T13:03:29.798525",
|
|
966
|
+
"exception": false,
|
|
967
|
+
"start_time": "2026-02-02T13:03:29.780737",
|
|
968
|
+
"status": "completed"
|
|
969
|
+
},
|
|
970
|
+
"tags": []
|
|
971
|
+
},
|
|
972
|
+
"outputs": [],
|
|
973
|
+
"source": [
|
|
974
|
+
"print(\"=\" * 70)\n",
|
|
975
|
+
"print(\"CREATING DERIVED FEATURES\")\n",
|
|
976
|
+
"print(\"=\" * 70)\n",
|
|
977
|
+
"\n",
|
|
978
|
+
"segmenter = CustomerSegmenter()\n",
|
|
979
|
+
"df_features = df.copy()\n",
|
|
980
|
+
"\n",
|
|
981
|
+
"datetime_cols = [name for name, col in findings.columns.items() \n",
|
|
982
|
+
" if col.inferred_type == ColumnType.DATETIME\n",
|
|
983
|
+
" and name not in TEMPORAL_METADATA_COLS]\n",
|
|
984
|
+
"binary_cols = [name for name, col in findings.columns.items() \n",
|
|
985
|
+
" if col.inferred_type == ColumnType.BINARY\n",
|
|
986
|
+
" and name not in TEMPORAL_METADATA_COLS]\n",
|
|
987
|
+
"numeric_cols = [name for name, col in findings.columns.items() \n",
|
|
988
|
+
" if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]]\n",
|
|
989
|
+
"\n",
|
|
990
|
+
"for col in datetime_cols:\n",
|
|
991
|
+
" df_features[col] = pd.to_datetime(df_features[col], errors='coerce', format='mixed')\n",
|
|
992
|
+
"\n",
|
|
993
|
+
"reference_date = pd.Timestamp.now()\n",
|
|
994
|
+
"if datetime_cols:\n",
|
|
995
|
+
" last_dates = [df_features[col].max() for col in datetime_cols if df_features[col].notna().any()]\n",
|
|
996
|
+
" if last_dates:\n",
|
|
997
|
+
" reference_date = max(last_dates)\n",
|
|
998
|
+
"print(f\"\\nReference date: {reference_date}\")\n",
|
|
999
|
+
"\n",
|
|
1000
|
+
"print(\"\\n📅 TIME-BASED FEATURES:\")\n",
|
|
1001
|
+
"created_cols = [c for c in datetime_cols if 'creat' in c.lower() or 'signup' in c.lower() or 'register' in c.lower()]\n",
|
|
1002
|
+
"if created_cols:\n",
|
|
1003
|
+
" created_col = created_cols[0]\n",
|
|
1004
|
+
" df_features = segmenter.create_tenure_features(df_features, created_column=created_col, reference_date=reference_date)\n",
|
|
1005
|
+
" print(f\" ✓ tenure_days from {created_col}\")\n",
|
|
1006
|
+
" registry.add_silver_derived(\n",
|
|
1007
|
+
" column=\"tenure_days\",\n",
|
|
1008
|
+
" expression=f\"(reference_date - {created_col}).days\",\n",
|
|
1009
|
+
" feature_type=\"tenure\",\n",
|
|
1010
|
+
" rationale=f\"Customer tenure in days from {created_col}\",\n",
|
|
1011
|
+
" source_notebook=\"06_feature_opportunities\"\n",
|
|
1012
|
+
" )\n",
|
|
1013
|
+
"\n",
|
|
1014
|
+
"activity_cols = [c for c in datetime_cols if 'last' in c.lower() or 'recent' in c.lower()]\n",
|
|
1015
|
+
"if activity_cols:\n",
|
|
1016
|
+
" activity_col = activity_cols[0]\n",
|
|
1017
|
+
" df_features = segmenter.create_recency_features(df_features, last_activity_column=activity_col, \n",
|
|
1018
|
+
" reference_date=reference_date, output_column='days_since_last_activity')\n",
|
|
1019
|
+
" print(f\" ✓ days_since_last_activity from {activity_col}\")\n",
|
|
1020
|
+
" registry.add_silver_derived(\n",
|
|
1021
|
+
" column=\"days_since_last_activity\",\n",
|
|
1022
|
+
" expression=f\"(reference_date - {activity_col}).days\",\n",
|
|
1023
|
+
" feature_type=\"recency\",\n",
|
|
1024
|
+
" rationale=f\"Days since last activity from {activity_col}\",\n",
|
|
1025
|
+
" source_notebook=\"06_feature_opportunities\"\n",
|
|
1026
|
+
" )\n",
|
|
1027
|
+
"\n",
|
|
1028
|
+
"print(\"\\n📧 ENGAGEMENT FEATURES:\")\n",
|
|
1029
|
+
"rate_cols = [c for c in numeric_cols if 'rate' in c.lower() or 'pct' in c.lower() or 'percent' in c.lower()]\n",
|
|
1030
|
+
"open_rate_cols = [c for c in rate_cols if 'open' in c.lower()]\n",
|
|
1031
|
+
"click_rate_cols = [c for c in rate_cols if 'click' in c.lower()]\n",
|
|
1032
|
+
"\n",
|
|
1033
|
+
"if open_rate_cols and click_rate_cols:\n",
|
|
1034
|
+
" open_col, click_col = open_rate_cols[0], click_rate_cols[0]\n",
|
|
1035
|
+
" df_features = segmenter.create_engagement_score(df_features, open_rate_column=open_col, \n",
|
|
1036
|
+
" click_rate_column=click_col, output_column='email_engagement_score')\n",
|
|
1037
|
+
" print(f\" ✓ email_engagement_score from {open_col}, {click_col}\")\n",
|
|
1038
|
+
" registry.add_silver_derived(\n",
|
|
1039
|
+
" column=\"email_engagement_score\",\n",
|
|
1040
|
+
" expression=f\"0.6 * {open_col} + 0.4 * {click_col}\",\n",
|
|
1041
|
+
" feature_type=\"composite\",\n",
|
|
1042
|
+
" rationale=f\"Weighted engagement score from {open_col} and {click_col}\",\n",
|
|
1043
|
+
" source_notebook=\"06_feature_opportunities\"\n",
|
|
1044
|
+
" )\n",
|
|
1045
|
+
" \n",
|
|
1046
|
+
" df_features['click_to_open_rate'] = np.where(df_features[open_col] > 0, df_features[click_col] / df_features[open_col], 0)\n",
|
|
1047
|
+
" print(f\" ✓ click_to_open_rate\")\n",
|
|
1048
|
+
" registry.add_silver_ratio(\n",
|
|
1049
|
+
" column=\"click_to_open_rate\",\n",
|
|
1050
|
+
" numerator=click_col,\n",
|
|
1051
|
+
" denominator=open_col,\n",
|
|
1052
|
+
" rationale=f\"Click-to-open ratio: {click_col} / {open_col}\",\n",
|
|
1053
|
+
" source_notebook=\"06_feature_opportunities\"\n",
|
|
1054
|
+
" )\n",
|
|
1055
|
+
"\n",
|
|
1056
|
+
"print(\"\\n🔧 SERVICE ADOPTION:\")\n",
|
|
1057
|
+
"if binary_cols:\n",
|
|
1058
|
+
" service_binary = [c for c in binary_cols if c != findings.target_column]\n",
|
|
1059
|
+
" if service_binary:\n",
|
|
1060
|
+
" df_features['service_adoption_score'] = df_features[service_binary].sum(axis=1)\n",
|
|
1061
|
+
" print(f\" ✓ service_adoption_score from {service_binary}\")\n",
|
|
1062
|
+
" registry.add_silver_derived(\n",
|
|
1063
|
+
" column=\"service_adoption_score\",\n",
|
|
1064
|
+
" expression=f\"sum([{', '.join(service_binary)}])\",\n",
|
|
1065
|
+
" feature_type=\"composite\",\n",
|
|
1066
|
+
" rationale=f\"Service adoption count from {len(service_binary)} binary flags\",\n",
|
|
1067
|
+
" source_notebook=\"06_feature_opportunities\"\n",
|
|
1068
|
+
" )\n",
|
|
1069
|
+
"\n",
|
|
1070
|
+
"print(\"\\n💰 VALUE FEATURES:\")\n",
|
|
1071
|
+
"value_cols = [c for c in numeric_cols if 'order' in c.lower() or 'amount' in c.lower() or 'value' in c.lower() or 'avg' in c.lower()]\n",
|
|
1072
|
+
"freq_cols = [c for c in numeric_cols if 'freq' in c.lower() or 'count' in c.lower()]\n",
|
|
1073
|
+
"if value_cols and freq_cols:\n",
|
|
1074
|
+
" df_features['value_frequency_product'] = df_features[value_cols[0]] * df_features[freq_cols[0]]\n",
|
|
1075
|
+
" print(f\" ✓ value_frequency_product from {value_cols[0]}, {freq_cols[0]}\")\n",
|
|
1076
|
+
" registry.add_silver_interaction(\n",
|
|
1077
|
+
" column=\"value_frequency_product\",\n",
|
|
1078
|
+
" features=[value_cols[0], freq_cols[0]],\n",
|
|
1079
|
+
" rationale=f\"Value-frequency interaction: {value_cols[0]} × {freq_cols[0]}\",\n",
|
|
1080
|
+
" source_notebook=\"06_feature_opportunities\"\n",
|
|
1081
|
+
" )\n",
|
|
1082
|
+
"\n",
|
|
1083
|
+
"new_cols = len(df_features.columns) - len(df.columns)\n",
|
|
1084
|
+
"print(f\"\\n✓ Created {new_cols} new features (total: {len(df_features.columns)})\")\n",
|
|
1085
|
+
"print(f\"✅ Persisted {len([c for c in ['tenure_days', 'days_since_last_activity', 'email_engagement_score', 'click_to_open_rate', 'service_adoption_score', 'value_frequency_product'] if c in df_features.columns])} derived feature recommendations to registry\")"
|
|
1086
|
+
]
|
|
1087
|
+
},
|
|
1088
|
+
{
|
|
1089
|
+
"cell_type": "markdown",
|
|
1090
|
+
"id": "534556db",
|
|
1091
|
+
"metadata": {
|
|
1092
|
+
"papermill": {
|
|
1093
|
+
"duration": 0.006703,
|
|
1094
|
+
"end_time": "2026-02-02T13:03:29.811588",
|
|
1095
|
+
"exception": false,
|
|
1096
|
+
"start_time": "2026-02-02T13:03:29.804885",
|
|
1097
|
+
"status": "completed"
|
|
1098
|
+
},
|
|
1099
|
+
"tags": []
|
|
1100
|
+
},
|
|
1101
|
+
"source": [
|
|
1102
|
+
"## 6.6 Customer Segmentation Features\n",
|
|
1103
|
+
"\n",
|
|
1104
|
+
"Create business-meaningful segments for analysis and modeling.\n",
|
|
1105
|
+
"\n",
|
|
1106
|
+
"**📖 Segmentation Strategy:**\n",
|
|
1107
|
+
"- **Value Dimension**: High vs Low (based on avgorder median)\n",
|
|
1108
|
+
"- **Frequency Dimension**: Frequent vs Infrequent (based on ordfreq median)\n",
|
|
1109
|
+
"- **Recency Buckets**: Active, Recent, Lapsing, Dormant"
|
|
1110
|
+
]
|
|
1111
|
+
},
|
|
1112
|
+
{
|
|
1113
|
+
"cell_type": "code",
|
|
1114
|
+
"execution_count": null,
|
|
1115
|
+
"id": "fcd3b7d1",
|
|
1116
|
+
"metadata": {
|
|
1117
|
+
"execution": {
|
|
1118
|
+
"iopub.execute_input": "2026-02-02T13:03:29.824904Z",
|
|
1119
|
+
"iopub.status.busy": "2026-02-02T13:03:29.824788Z",
|
|
1120
|
+
"iopub.status.idle": "2026-02-02T13:03:29.830423Z",
|
|
1121
|
+
"shell.execute_reply": "2026-02-02T13:03:29.829634Z"
|
|
1122
|
+
},
|
|
1123
|
+
"papermill": {
|
|
1124
|
+
"duration": 0.013189,
|
|
1125
|
+
"end_time": "2026-02-02T13:03:29.831041",
|
|
1126
|
+
"exception": false,
|
|
1127
|
+
"start_time": "2026-02-02T13:03:29.817852",
|
|
1128
|
+
"status": "completed"
|
|
1129
|
+
},
|
|
1130
|
+
"tags": []
|
|
1131
|
+
},
|
|
1132
|
+
"outputs": [],
|
|
1133
|
+
"source": [
|
|
1134
|
+
"print(\"=\" * 70)\n",
|
|
1135
|
+
"print(\"CUSTOMER SEGMENTATION\")\n",
|
|
1136
|
+
"print(\"=\" * 70)\n",
|
|
1137
|
+
"\n",
|
|
1138
|
+
"print(\"\\n🎯 VALUE-FREQUENCY SEGMENTS:\")\n",
|
|
1139
|
+
"value_cols = [c for c in numeric_cols if 'order' in c.lower() or 'amount' in c.lower() or 'value' in c.lower() or 'avg' in c.lower()]\n",
|
|
1140
|
+
"freq_cols = [c for c in numeric_cols if 'freq' in c.lower() or 'count' in c.lower()]\n",
|
|
1141
|
+
"\n",
|
|
1142
|
+
"if value_cols and freq_cols:\n",
|
|
1143
|
+
" df_features, vf_result = segmenter.segment_by_value_frequency(\n",
|
|
1144
|
+
" df_features, value_column=value_cols[0], frequency_column=freq_cols[0])\n",
|
|
1145
|
+
" print(f\" Using {value_cols[0]} × {freq_cols[0]}\")\n",
|
|
1146
|
+
" for seg in vf_result.segments:\n",
|
|
1147
|
+
" print(f\" {seg.name}: {seg.count:,} ({seg.percentage:.1f}%)\")\n",
|
|
1148
|
+
"else:\n",
|
|
1149
|
+
" print(\" No suitable value/frequency columns found\")\n",
|
|
1150
|
+
"\n",
|
|
1151
|
+
"print(\"\\n📅 RECENCY SEGMENTS:\")\n",
|
|
1152
|
+
"if 'days_since_last_activity' in df_features.columns:\n",
|
|
1153
|
+
" df_features, recency_result = segmenter.segment_by_recency(df_features, days_since_column='days_since_last_activity')\n",
|
|
1154
|
+
" for seg in recency_result.segments:\n",
|
|
1155
|
+
" print(f\" {seg.name}: {seg.count:,} ({seg.percentage:.1f}%)\")\n",
|
|
1156
|
+
"else:\n",
|
|
1157
|
+
" print(\" No recency column available\")\n",
|
|
1158
|
+
"\n",
|
|
1159
|
+
"print(\"\\n📧 ENGAGEMENT SEGMENTS:\")\n",
|
|
1160
|
+
"if 'email_engagement_score' in df_features.columns:\n",
|
|
1161
|
+
" max_score = df_features['email_engagement_score'].max()\n",
|
|
1162
|
+
" if max_score > 0:\n",
|
|
1163
|
+
" df_features['engagement_normalized'] = df_features['email_engagement_score'] / max_score\n",
|
|
1164
|
+
" df_features, eng_result = segmenter.segment_by_engagement(df_features, engagement_column='engagement_normalized')\n",
|
|
1165
|
+
" for seg in eng_result.segments:\n",
|
|
1166
|
+
" print(f\" {seg.name}: {seg.count:,} ({seg.percentage:.1f}%)\")\n",
|
|
1167
|
+
" df_features = df_features.drop(columns=['engagement_normalized'])\n",
|
|
1168
|
+
"else:\n",
|
|
1169
|
+
" print(\" No engagement score available\")\n",
|
|
1170
|
+
"\n",
|
|
1171
|
+
"if 'customer_segment' in df_features.columns and findings.target_column and findings.target_column in df_features.columns:\n",
|
|
1172
|
+
" target = findings.target_column\n",
|
|
1173
|
+
" segment_retention = df_features.groupby('customer_segment')[target].mean() * 100\n",
|
|
1174
|
+
" \n",
|
|
1175
|
+
" max_rate = segment_retention.max()\n",
|
|
1176
|
+
" fig = go.Figure(go.Bar(\n",
|
|
1177
|
+
" x=segment_retention.index, y=segment_retention.values,\n",
|
|
1178
|
+
" marker_color=['#2ca02c' if r > 70 else '#ffbb00' if r > 50 else '#d62728' for r in segment_retention.values],\n",
|
|
1179
|
+
" text=[f'{r:.1f}%' for r in segment_retention.values], textposition='outside'))\n",
|
|
1180
|
+
" fig.update_layout(\n",
|
|
1181
|
+
" title='Retention Rate by Customer Segment', \n",
|
|
1182
|
+
" xaxis_title='Segment', \n",
|
|
1183
|
+
" yaxis_title='Retention Rate (%)',\n",
|
|
1184
|
+
" yaxis_range=[0, max_rate * 1.15], # Add 15% headroom for labels\n",
|
|
1185
|
+
" template='plotly_white', \n",
|
|
1186
|
+
" height=400,\n",
|
|
1187
|
+
" )\n",
|
|
1188
|
+
" display_figure(fig)\n",
|
|
1189
|
+
"\n",
|
|
1190
|
+
"segment_cols = [c for c in df_features.columns if 'segment' in c.lower() or 'bucket' in c.lower()]\n",
|
|
1191
|
+
"print(f\"\\n✓ Created {len(segment_cols)} segmentation features\")"
|
|
1192
|
+
]
|
|
1193
|
+
},
|
|
1194
|
+
{
|
|
1195
|
+
"cell_type": "markdown",
|
|
1196
|
+
"id": "59ec5a4e",
|
|
1197
|
+
"metadata": {
|
|
1198
|
+
"papermill": {
|
|
1199
|
+
"duration": 0.006712,
|
|
1200
|
+
"end_time": "2026-02-02T13:03:29.845013",
|
|
1201
|
+
"exception": false,
|
|
1202
|
+
"start_time": "2026-02-02T13:03:29.838301",
|
|
1203
|
+
"status": "completed"
|
|
1204
|
+
},
|
|
1205
|
+
"tags": []
|
|
1206
|
+
},
|
|
1207
|
+
"source": [
|
|
1208
|
+
"## 6.7 Numeric Transformation Opportunities"
|
|
1209
|
+
]
|
|
1210
|
+
},
|
|
1211
|
+
{
|
|
1212
|
+
"cell_type": "code",
|
|
1213
|
+
"execution_count": null,
|
|
1214
|
+
"id": "8ca081a4",
|
|
1215
|
+
"metadata": {
|
|
1216
|
+
"execution": {
|
|
1217
|
+
"iopub.execute_input": "2026-02-02T13:03:29.859441Z",
|
|
1218
|
+
"iopub.status.busy": "2026-02-02T13:03:29.859325Z",
|
|
1219
|
+
"iopub.status.idle": "2026-02-02T13:03:29.869333Z",
|
|
1220
|
+
"shell.execute_reply": "2026-02-02T13:03:29.868693Z"
|
|
1221
|
+
},
|
|
1222
|
+
"papermill": {
|
|
1223
|
+
"duration": 0.018433,
|
|
1224
|
+
"end_time": "2026-02-02T13:03:29.869973",
|
|
1225
|
+
"exception": false,
|
|
1226
|
+
"start_time": "2026-02-02T13:03:29.851540",
|
|
1227
|
+
"status": "completed"
|
|
1228
|
+
},
|
|
1229
|
+
"tags": []
|
|
1230
|
+
},
|
|
1231
|
+
"outputs": [],
|
|
1232
|
+
"source": [
|
|
1233
|
+
"numeric_cols = [\n",
|
|
1234
|
+
" name for name, col in findings.columns.items()\n",
|
|
1235
|
+
" if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
|
|
1236
|
+
" and name not in TEMPORAL_METADATA_COLS\n",
|
|
1237
|
+
"]\n",
|
|
1238
|
+
"\n",
|
|
1239
|
+
"transform_count = 0\n",
|
|
1240
|
+
"if numeric_cols:\n",
|
|
1241
|
+
" print(\"Numeric Transformation Opportunities:\")\n",
|
|
1242
|
+
" print(\"=\"*50)\n",
|
|
1243
|
+
" \n",
|
|
1244
|
+
" for col_name in numeric_cols:\n",
|
|
1245
|
+
" col_info = findings.columns[col_name]\n",
|
|
1246
|
+
" series = df[col_name].dropna()\n",
|
|
1247
|
+
" skewness = series.skew()\n",
|
|
1248
|
+
" \n",
|
|
1249
|
+
" print(f\"\\n{col_name}:\")\n",
|
|
1250
|
+
" print(f\" Skewness: {skewness:.2f}\")\n",
|
|
1251
|
+
" \n",
|
|
1252
|
+
" if abs(skewness) > 1:\n",
|
|
1253
|
+
" print(f\" Recommendation: Apply log transform (highly skewed)\")\n",
|
|
1254
|
+
" registry.add_gold_transformation(\n",
|
|
1255
|
+
" column=col_name,\n",
|
|
1256
|
+
" transform=\"log\",\n",
|
|
1257
|
+
" parameters={\"skewness\": float(skewness), \"reason\": \"highly_skewed\"},\n",
|
|
1258
|
+
" rationale=f\"Log transform for highly skewed distribution (skewness={skewness:.2f})\",\n",
|
|
1259
|
+
" source_notebook=\"06_feature_opportunities\"\n",
|
|
1260
|
+
" )\n",
|
|
1261
|
+
" transform_count += 1\n",
|
|
1262
|
+
" elif abs(skewness) > 0.5:\n",
|
|
1263
|
+
" print(f\" Recommendation: Consider sqrt transform (moderately skewed)\")\n",
|
|
1264
|
+
" registry.add_gold_transformation(\n",
|
|
1265
|
+
" column=col_name,\n",
|
|
1266
|
+
" transform=\"sqrt\",\n",
|
|
1267
|
+
" parameters={\"skewness\": float(skewness), \"reason\": \"moderately_skewed\"},\n",
|
|
1268
|
+
" rationale=f\"Sqrt transform for moderately skewed distribution (skewness={skewness:.2f})\",\n",
|
|
1269
|
+
" source_notebook=\"06_feature_opportunities\"\n",
|
|
1270
|
+
" )\n",
|
|
1271
|
+
" transform_count += 1\n",
|
|
1272
|
+
" else:\n",
|
|
1273
|
+
" print(f\" Recommendation: Standard scaling sufficient\")\n",
|
|
1274
|
+
" registry.add_gold_scaling(\n",
|
|
1275
|
+
" column=col_name,\n",
|
|
1276
|
+
" method=\"standard\",\n",
|
|
1277
|
+
" rationale=f\"Standard scaling for normally distributed column (skewness={skewness:.2f})\",\n",
|
|
1278
|
+
" source_notebook=\"06_feature_opportunities\"\n",
|
|
1279
|
+
" )\n",
|
|
1280
|
+
" transform_count += 1\n",
|
|
1281
|
+
" \n",
|
|
1282
|
+
" if col_info.inferred_type == ColumnType.NUMERIC_CONTINUOUS:\n",
|
|
1283
|
+
" print(f\" Binning: Consider creating bins for {col_name}_binned\")\n",
|
|
1284
|
+
" \n",
|
|
1285
|
+
" print(f\"\\n✅ Persisted {transform_count} transformation recommendations to registry\")"
|
|
1286
|
+
]
|
|
1287
|
+
},
|
|
1288
|
+
{
|
|
1289
|
+
"cell_type": "markdown",
|
|
1290
|
+
"id": "013eb7ce",
|
|
1291
|
+
"metadata": {
|
|
1292
|
+
"papermill": {
|
|
1293
|
+
"duration": 0.006356,
|
|
1294
|
+
"end_time": "2026-02-02T13:03:29.882881",
|
|
1295
|
+
"exception": false,
|
|
1296
|
+
"start_time": "2026-02-02T13:03:29.876525",
|
|
1297
|
+
"status": "completed"
|
|
1298
|
+
},
|
|
1299
|
+
"tags": []
|
|
1300
|
+
},
|
|
1301
|
+
"source": [
|
|
1302
|
+
"## 6.8 Categorical Encoding Opportunities"
|
|
1303
|
+
]
|
|
1304
|
+
},
|
|
1305
|
+
{
|
|
1306
|
+
"cell_type": "code",
|
|
1307
|
+
"execution_count": null,
|
|
1308
|
+
"id": "10f484f7",
|
|
1309
|
+
"metadata": {
|
|
1310
|
+
"execution": {
|
|
1311
|
+
"iopub.execute_input": "2026-02-02T13:03:29.896805Z",
|
|
1312
|
+
"iopub.status.busy": "2026-02-02T13:03:29.896685Z",
|
|
1313
|
+
"iopub.status.idle": "2026-02-02T13:03:29.901031Z",
|
|
1314
|
+
"shell.execute_reply": "2026-02-02T13:03:29.900122Z"
|
|
1315
|
+
},
|
|
1316
|
+
"papermill": {
|
|
1317
|
+
"duration": 0.01235,
|
|
1318
|
+
"end_time": "2026-02-02T13:03:29.901621",
|
|
1319
|
+
"exception": false,
|
|
1320
|
+
"start_time": "2026-02-02T13:03:29.889271",
|
|
1321
|
+
"status": "completed"
|
|
1322
|
+
},
|
|
1323
|
+
"tags": []
|
|
1324
|
+
},
|
|
1325
|
+
"outputs": [],
|
|
1326
|
+
"source": [
|
|
1327
|
+
"categorical_cols = [\n",
|
|
1328
|
+
" name for name, col in findings.columns.items()\n",
|
|
1329
|
+
" if col.inferred_type in [ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL]\n",
|
|
1330
|
+
" and name not in TEMPORAL_METADATA_COLS\n",
|
|
1331
|
+
"]\n",
|
|
1332
|
+
"\n",
|
|
1333
|
+
"encoding_count = 0\n",
|
|
1334
|
+
"if categorical_cols:\n",
|
|
1335
|
+
" print(\"Categorical Encoding Recommendations:\")\n",
|
|
1336
|
+
" print(\"=\"*50)\n",
|
|
1337
|
+
" \n",
|
|
1338
|
+
" for col_name in categorical_cols:\n",
|
|
1339
|
+
" col_info = findings.columns[col_name]\n",
|
|
1340
|
+
" distinct = col_info.universal_metrics.get(\"distinct_count\", 0)\n",
|
|
1341
|
+
" \n",
|
|
1342
|
+
" print(f\"\\n{col_name}: ({distinct} unique values)\")\n",
|
|
1343
|
+
" \n",
|
|
1344
|
+
" if distinct <= 5:\n",
|
|
1345
|
+
" print(f\" Recommendation: One-hot encoding\")\n",
|
|
1346
|
+
" registry.add_gold_encoding(\n",
|
|
1347
|
+
" column=col_name,\n",
|
|
1348
|
+
" method=\"onehot\",\n",
|
|
1349
|
+
" rationale=f\"One-hot encoding for low cardinality ({distinct} unique values)\",\n",
|
|
1350
|
+
" source_notebook=\"06_feature_opportunities\"\n",
|
|
1351
|
+
" )\n",
|
|
1352
|
+
" encoding_count += 1\n",
|
|
1353
|
+
" elif distinct <= 20:\n",
|
|
1354
|
+
" print(f\" Recommendation: Target encoding or one-hot with frequency threshold\")\n",
|
|
1355
|
+
" registry.add_gold_encoding(\n",
|
|
1356
|
+
" column=col_name,\n",
|
|
1357
|
+
" method=\"target\",\n",
|
|
1358
|
+
" rationale=f\"Target encoding for medium cardinality ({distinct} unique values)\",\n",
|
|
1359
|
+
" source_notebook=\"06_feature_opportunities\"\n",
|
|
1360
|
+
" )\n",
|
|
1361
|
+
" encoding_count += 1\n",
|
|
1362
|
+
" else:\n",
|
|
1363
|
+
" print(f\" Recommendation: Target encoding or embedding (high cardinality)\")\n",
|
|
1364
|
+
" registry.add_gold_encoding(\n",
|
|
1365
|
+
" column=col_name,\n",
|
|
1366
|
+
" method=\"target\",\n",
|
|
1367
|
+
" rationale=f\"Target encoding for high cardinality ({distinct} unique values)\",\n",
|
|
1368
|
+
" source_notebook=\"06_feature_opportunities\"\n",
|
|
1369
|
+
" )\n",
|
|
1370
|
+
" encoding_count += 1\n",
|
|
1371
|
+
" \n",
|
|
1372
|
+
" if col_info.inferred_type == ColumnType.CATEGORICAL_ORDINAL:\n",
|
|
1373
|
+
" print(f\" Note: Consider ordinal encoding to preserve order\")\n",
|
|
1374
|
+
" \n",
|
|
1375
|
+
" print(f\"\\n✅ Persisted {encoding_count} encoding recommendations to registry\")"
|
|
1376
|
+
]
|
|
1377
|
+
},
|
|
1378
|
+
{
|
|
1379
|
+
"cell_type": "markdown",
|
|
1380
|
+
"id": "83e8707f",
|
|
1381
|
+
"metadata": {
|
|
1382
|
+
"papermill": {
|
|
1383
|
+
"duration": 0.006628,
|
|
1384
|
+
"end_time": "2026-02-02T13:03:29.915503",
|
|
1385
|
+
"exception": false,
|
|
1386
|
+
"start_time": "2026-02-02T13:03:29.908875",
|
|
1387
|
+
"status": "completed"
|
|
1388
|
+
},
|
|
1389
|
+
"tags": []
|
|
1390
|
+
},
|
|
1391
|
+
"source": [
|
|
1392
|
+
"---\n",
|
|
1393
|
+
"\n",
|
|
1394
|
+
"## Summary: What We Learned\n",
|
|
1395
|
+
"\n",
|
|
1396
|
+
"In this notebook, we identified feature engineering opportunities and analyzed data capacity:\n",
|
|
1397
|
+
"\n",
|
|
1398
|
+
"### Feature Capacity Analysis\n",
|
|
1399
|
+
"1. **Events Per Variable (EPV)** - Calculated the data's capacity to support features\n",
|
|
1400
|
+
"2. **Effective Features** - Identified redundant features due to high correlation\n",
|
|
1401
|
+
"3. **Model Complexity Guidance** - Determined appropriate model types based on data size\n",
|
|
1402
|
+
"4. **Segment Capacity** - Evaluated whether segmented modeling is viable\n",
|
|
1403
|
+
"\n",
|
|
1404
|
+
"### Feature Engineering\n",
|
|
1405
|
+
"5. **Automated Recommendations** - Framework suggested feature opportunities\n",
|
|
1406
|
+
"6. **Time-Based Features** - Created tenure, recency, active period metrics\n",
|
|
1407
|
+
"7. **Engagement Scores** - Built composite email engagement metrics\n",
|
|
1408
|
+
"8. **Customer Segments** - Created value-frequency and recency-based segments\n",
|
|
1409
|
+
"9. **Encoding Strategies** - Identified optimal encoding for each categorical\n",
|
|
1410
|
+
"\n",
|
|
1411
|
+
"## Feature Capacity Key Concepts\n",
|
|
1412
|
+
"\n",
|
|
1413
|
+
"| Metric | What It Means | Rule of Thumb |\n",
|
|
1414
|
+
"|--------|---------------|---------------|\n",
|
|
1415
|
+
"| **EPV ≥ 20** | Stable, reliable estimates | Conservative, regulatory-grade |\n",
|
|
1416
|
+
"| **EPV = 10-20** | Standard practice | Use for most applications |\n",
|
|
1417
|
+
"| **EPV = 5-10** | Limited capacity | Requires strong regularization |\n",
|
|
1418
|
+
"| **EPV < 5** | High risk | Reduce features or get more data |\n",
|
|
1419
|
+
"\n",
|
|
1420
|
+
"## Key Derived Features Created\n",
|
|
1421
|
+
"\n",
|
|
1422
|
+
"| Feature | Formula | Business Meaning |\n",
|
|
1423
|
+
"|---------|---------|-----------------|\n",
|
|
1424
|
+
"| `tenure_days` | reference_date - created | Customer longevity |\n",
|
|
1425
|
+
"| `days_since_last_activity` | reference_date - last_activity | Recency/engagement |\n",
|
|
1426
|
+
"| `email_engagement_score` | 0.6×openrate + 0.4×clickrate | Overall engagement |\n",
|
|
1427
|
+
"| `service_adoption_score` | paperless + refill + doorstep | Service utilization |\n",
|
|
1428
|
+
"| `customer_segment` | Value × Frequency quadrant | Customer type |\n",
|
|
1429
|
+
"\n",
|
|
1430
|
+
"---\n",
|
|
1431
|
+
"\n",
|
|
1432
|
+
"## Next Steps\n",
|
|
1433
|
+
"\n",
|
|
1434
|
+
"Continue to **07_modeling_readiness.ipynb** to:\n",
|
|
1435
|
+
"- Validate data is ready for modeling\n",
|
|
1436
|
+
"- Check for data leakage\n",
|
|
1437
|
+
"- Assess class imbalance\n",
|
|
1438
|
+
"- Review feature completeness"
|
|
1439
|
+
]
|
|
1440
|
+
},
|
|
1441
|
+
{
|
|
1442
|
+
"cell_type": "code",
|
|
1443
|
+
"execution_count": null,
|
|
1444
|
+
"id": "10135d83",
|
|
1445
|
+
"metadata": {
|
|
1446
|
+
"execution": {
|
|
1447
|
+
"iopub.execute_input": "2026-02-02T13:03:29.929595Z",
|
|
1448
|
+
"iopub.status.busy": "2026-02-02T13:03:29.929481Z",
|
|
1449
|
+
"iopub.status.idle": "2026-02-02T13:03:29.932214Z",
|
|
1450
|
+
"shell.execute_reply": "2026-02-02T13:03:29.931785Z"
|
|
1451
|
+
},
|
|
1452
|
+
"papermill": {
|
|
1453
|
+
"duration": 0.010581,
|
|
1454
|
+
"end_time": "2026-02-02T13:03:29.932681",
|
|
1455
|
+
"exception": false,
|
|
1456
|
+
"start_time": "2026-02-02T13:03:29.922100",
|
|
1457
|
+
"status": "completed"
|
|
1458
|
+
},
|
|
1459
|
+
"tags": []
|
|
1460
|
+
},
|
|
1461
|
+
"outputs": [],
|
|
1462
|
+
"source": [
|
|
1463
|
+
"print(\"Potential Interaction Features:\")\n",
|
|
1464
|
+
"print(\"=\"*50)\n",
|
|
1465
|
+
"\n",
|
|
1466
|
+
"if len(numeric_cols) >= 2:\n",
|
|
1467
|
+
" print(\"\\nNumeric Interactions:\")\n",
|
|
1468
|
+
" for i, col1 in enumerate(numeric_cols[:3]):\n",
|
|
1469
|
+
" for col2 in numeric_cols[i+1:4]:\n",
|
|
1470
|
+
" print(f\" - {col1}_x_{col2}: Multiplication\")\n",
|
|
1471
|
+
" print(f\" - {col1}_div_{col2}: Division (if {col2} > 0)\")\n",
|
|
1472
|
+
"\n",
|
|
1473
|
+
"if categorical_cols and numeric_cols:\n",
|
|
1474
|
+
" print(\"\\nCategorical-Numeric Interactions:\")\n",
|
|
1475
|
+
" for cat_col in categorical_cols[:2]:\n",
|
|
1476
|
+
" for num_col in numeric_cols[:2]:\n",
|
|
1477
|
+
" print(f\" - {num_col}_by_{cat_col}_mean: Group mean\")\n",
|
|
1478
|
+
" print(f\" - {num_col}_by_{cat_col}_std: Group std\")"
|
|
1479
|
+
]
|
|
1480
|
+
},
|
|
1481
|
+
{
|
|
1482
|
+
"cell_type": "markdown",
|
|
1483
|
+
"id": "fd5c408b",
|
|
1484
|
+
"metadata": {
|
|
1485
|
+
"papermill": {
|
|
1486
|
+
"duration": 0.006519,
|
|
1487
|
+
"end_time": "2026-02-02T13:03:29.945809",
|
|
1488
|
+
"exception": false,
|
|
1489
|
+
"start_time": "2026-02-02T13:03:29.939290",
|
|
1490
|
+
"status": "completed"
|
|
1491
|
+
},
|
|
1492
|
+
"tags": []
|
|
1493
|
+
},
|
|
1494
|
+
"source": [
|
|
1495
|
+
"## 6.9 Feature Summary Table"
|
|
1496
|
+
]
|
|
1497
|
+
},
|
|
1498
|
+
{
|
|
1499
|
+
"cell_type": "code",
|
|
1500
|
+
"execution_count": null,
|
|
1501
|
+
"id": "91ae21da",
|
|
1502
|
+
"metadata": {
|
|
1503
|
+
"execution": {
|
|
1504
|
+
"iopub.execute_input": "2026-02-02T13:03:29.959750Z",
|
|
1505
|
+
"iopub.status.busy": "2026-02-02T13:03:29.959602Z",
|
|
1506
|
+
"iopub.status.idle": "2026-02-02T13:03:29.964357Z",
|
|
1507
|
+
"shell.execute_reply": "2026-02-02T13:03:29.963917Z"
|
|
1508
|
+
},
|
|
1509
|
+
"papermill": {
|
|
1510
|
+
"duration": 0.012698,
|
|
1511
|
+
"end_time": "2026-02-02T13:03:29.964917",
|
|
1512
|
+
"exception": false,
|
|
1513
|
+
"start_time": "2026-02-02T13:03:29.952219",
|
|
1514
|
+
"status": "completed"
|
|
1515
|
+
},
|
|
1516
|
+
"tags": []
|
|
1517
|
+
},
|
|
1518
|
+
"outputs": [],
|
|
1519
|
+
"source": [
|
|
1520
|
+
"feature_summary = []\n",
|
|
1521
|
+
"for rec in feature_recs:\n",
|
|
1522
|
+
" feature_summary.append({\n",
|
|
1523
|
+
" \"Feature Name\": rec.feature_name,\n",
|
|
1524
|
+
" \"Source\": rec.source_column,\n",
|
|
1525
|
+
" \"Type\": rec.feature_type,\n",
|
|
1526
|
+
" \"Priority\": rec.priority\n",
|
|
1527
|
+
" })\n",
|
|
1528
|
+
"\n",
|
|
1529
|
+
"if feature_summary:\n",
|
|
1530
|
+
" summary_df = pd.DataFrame(feature_summary)\n",
|
|
1531
|
+
" display(summary_df)"
|
|
1532
|
+
]
|
|
1533
|
+
},
|
|
1534
|
+
{
|
|
1535
|
+
"cell_type": "markdown",
|
|
1536
|
+
"id": "cc9d008e",
|
|
1537
|
+
"metadata": {
|
|
1538
|
+
"papermill": {
|
|
1539
|
+
"duration": 0.006911,
|
|
1540
|
+
"end_time": "2026-02-02T13:03:29.978469",
|
|
1541
|
+
"exception": false,
|
|
1542
|
+
"start_time": "2026-02-02T13:03:29.971558",
|
|
1543
|
+
"status": "completed"
|
|
1544
|
+
},
|
|
1545
|
+
"tags": []
|
|
1546
|
+
},
|
|
1547
|
+
"source": [
|
|
1548
|
+
"---\n",
|
|
1549
|
+
"\n",
|
|
1550
|
+
"## Next Steps\n",
|
|
1551
|
+
"\n",
|
|
1552
|
+
"Continue to **07_modeling_readiness.ipynb** to validate data is ready for modeling."
|
|
1553
|
+
]
|
|
1554
|
+
},
|
|
1555
|
+
{
|
|
1556
|
+
"cell_type": "code",
|
|
1557
|
+
"execution_count": null,
|
|
1558
|
+
"id": "4628808a",
|
|
1559
|
+
"metadata": {
|
|
1560
|
+
"execution": {
|
|
1561
|
+
"iopub.execute_input": "2026-02-02T13:03:29.992359Z",
|
|
1562
|
+
"iopub.status.busy": "2026-02-02T13:03:29.992067Z",
|
|
1563
|
+
"iopub.status.idle": "2026-02-02T13:03:34.680434Z",
|
|
1564
|
+
"shell.execute_reply": "2026-02-02T13:03:34.679976Z"
|
|
1565
|
+
},
|
|
1566
|
+
"papermill": {
|
|
1567
|
+
"duration": 4.696712,
|
|
1568
|
+
"end_time": "2026-02-02T13:03:34.681104",
|
|
1569
|
+
"exception": false,
|
|
1570
|
+
"start_time": "2026-02-02T13:03:29.984392",
|
|
1571
|
+
"status": "completed"
|
|
1572
|
+
},
|
|
1573
|
+
"tags": []
|
|
1574
|
+
},
|
|
1575
|
+
"outputs": [],
|
|
1576
|
+
"source": [
|
|
1577
|
+
"# Save recommendations\n",
|
|
1578
|
+
"registry.save(RECOMMENDATIONS_PATH)\n",
|
|
1579
|
+
"\n",
|
|
1580
|
+
"print(f\"✅ Saved {len(registry.all_recommendations)} recommendations to {RECOMMENDATIONS_PATH}\")\n",
|
|
1581
|
+
"print(f\"\\nRecommendations by layer:\")\n",
|
|
1582
|
+
"for layer in [\"bronze\", \"silver\", \"gold\"]:\n",
|
|
1583
|
+
" recs = registry.get_by_layer(layer)\n",
|
|
1584
|
+
" print(f\" {layer.upper()}: {len(recs)}\")\n",
|
|
1585
|
+
"\n",
|
|
1586
|
+
"from customer_retention.analysis.notebook_html_exporter import export_notebook_html\n",
|
|
1587
|
+
"export_notebook_html(Path(\"06_feature_opportunities.ipynb\"), EXPERIMENTS_DIR / \"docs\")\n"
|
|
1588
|
+
]
|
|
1589
|
+
}
|
|
1590
|
+
],
|
|
1591
|
+
"metadata": {
|
|
1592
|
+
"kernelspec": {
|
|
1593
|
+
"display_name": "Python 3",
|
|
1594
|
+
"language": "python",
|
|
1595
|
+
"name": "python3"
|
|
1596
|
+
},
|
|
1597
|
+
"language_info": {
|
|
1598
|
+
"codemirror_mode": {
|
|
1599
|
+
"name": "ipython",
|
|
1600
|
+
"version": 3
|
|
1601
|
+
},
|
|
1602
|
+
"file_extension": ".py",
|
|
1603
|
+
"mimetype": "text/x-python",
|
|
1604
|
+
"name": "python",
|
|
1605
|
+
"nbconvert_exporter": "python",
|
|
1606
|
+
"pygments_lexer": "ipython3",
|
|
1607
|
+
"version": "3.12.4"
|
|
1608
|
+
},
|
|
1609
|
+
"papermill": {
|
|
1610
|
+
"default_parameters": {},
|
|
1611
|
+
"duration": 8.626387,
|
|
1612
|
+
"end_time": "2026-02-02T13:03:35.105815",
|
|
1613
|
+
"environment_variables": {},
|
|
1614
|
+
"exception": null,
|
|
1615
|
+
"input_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/06_feature_opportunities.ipynb",
|
|
1616
|
+
"output_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/06_feature_opportunities.ipynb",
|
|
1617
|
+
"parameters": {},
|
|
1618
|
+
"start_time": "2026-02-02T13:03:26.479428",
|
|
1619
|
+
"version": "2.6.0"
|
|
1620
|
+
}
|
|
1621
|
+
},
|
|
1622
|
+
"nbformat": 4,
|
|
1623
|
+
"nbformat_minor": 5
|
|
1624
|
+
}
|