churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
|
@@ -0,0 +1,1457 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"id": "18a81db7",
|
|
6
|
+
"metadata": {
|
|
7
|
+
"papermill": {
|
|
8
|
+
"duration": 0.003496,
|
|
9
|
+
"end_time": "2026-02-02T13:03:21.294742",
|
|
10
|
+
"exception": false,
|
|
11
|
+
"start_time": "2026-02-02T13:03:21.291246",
|
|
12
|
+
"status": "completed"
|
|
13
|
+
},
|
|
14
|
+
"tags": []
|
|
15
|
+
},
|
|
16
|
+
"source": [
|
|
17
|
+
"# Chapter 5: Multi-Dataset Relationships\n",
|
|
18
|
+
"\n",
|
|
19
|
+
"**Purpose:** Combine multiple explored datasets, define relationships, and plan feature aggregations before feature engineering.\n",
|
|
20
|
+
"\n",
|
|
21
|
+
"**When to use this notebook:**\n",
|
|
22
|
+
"- You have explored multiple datasets using notebooks 01-04 (or 01a-01d)\n",
|
|
23
|
+
"- Your datasets share common keys (e.g., customer_id)\n",
|
|
24
|
+
"- You want to create features from event-level data to join with entity-level data\n",
|
|
25
|
+
"\n",
|
|
26
|
+
"**What you'll learn:**\n",
|
|
27
|
+
"- How to discover and manage multiple exploration findings\n",
|
|
28
|
+
"- How to detect and define relationships between datasets\n",
|
|
29
|
+
"- How to plan time-window aggregations for event datasets\n",
|
|
30
|
+
"- How to preview the feature set before engineering\n",
|
|
31
|
+
"\n",
|
|
32
|
+
"**Outputs:**\n",
|
|
33
|
+
"- Multi-dataset findings file (YAML)\n",
|
|
34
|
+
"- Relationship definitions\n",
|
|
35
|
+
"- Aggregation plan\n",
|
|
36
|
+
"\n",
|
|
37
|
+
"---\n",
|
|
38
|
+
"\n",
|
|
39
|
+
"## Multi-Dataset Architecture\n",
|
|
40
|
+
"\n",
|
|
41
|
+
"```\n",
|
|
42
|
+
"+------------------+      +-------------------+      +------------------+\n",
|
|
43
|
+
"| Entity Dataset   |      | Event Dataset 1   |      | Event Dataset 2  |\n",
|
|
44
|
+
"| (customers.csv)  |      | (transactions.csv)|      | (emails.csv)     |\n",
|
|
45
|
+
"|                  |      |                   |      |                  |\n",
|
|
46
|
+
"| - customer_id    |<---->| - customer_id     |<---->| - customer_id    |\n",
|
|
47
|
+
"| - churned (Y)    |      | - transaction_date|      | - sent_date      |\n",
|
|
48
|
+
"| - city           |      | - amount          |      | - opened         |\n",
|
|
49
|
+
"+------------------+      +-------------------+      +------------------+\n",
|
|
50
|
+
"         |                          |                         |\n",
|
|
51
|
+
"         v                          v                         v\n",
|
|
52
|
+
"   Primary Table              Aggregate to:             Aggregate to:\n",
|
|
53
|
+
"   (one row per               - amount_sum_7d           - email_count_30d\n",
|
|
54
|
+
"    customer)                 - txn_count_30d           - open_rate_90d\n",
|
|
55
|
+
"```"
|
|
56
|
+
]
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
"cell_type": "markdown",
|
|
60
|
+
"id": "915afefc",
|
|
61
|
+
"metadata": {
|
|
62
|
+
"papermill": {
|
|
63
|
+
"duration": 0.002093,
|
|
64
|
+
"end_time": "2026-02-02T13:03:21.299543",
|
|
65
|
+
"exception": false,
|
|
66
|
+
"start_time": "2026-02-02T13:03:21.297450",
|
|
67
|
+
"status": "completed"
|
|
68
|
+
},
|
|
69
|
+
"tags": []
|
|
70
|
+
},
|
|
71
|
+
"source": [
|
|
72
|
+
"## 5.1 Setup and Discover Datasets"
|
|
73
|
+
]
|
|
74
|
+
},
|
|
75
|
+
{
|
|
76
|
+
"cell_type": "code",
|
|
77
|
+
"execution_count": null,
|
|
78
|
+
"id": "c7d77a44",
|
|
79
|
+
"metadata": {
|
|
80
|
+
"execution": {
|
|
81
|
+
"iopub.execute_input": "2026-02-02T13:03:21.304757Z",
|
|
82
|
+
"iopub.status.busy": "2026-02-02T13:03:21.304630Z",
|
|
83
|
+
"iopub.status.idle": "2026-02-02T13:03:23.135487Z",
|
|
84
|
+
"shell.execute_reply": "2026-02-02T13:03:23.134869Z"
|
|
85
|
+
},
|
|
86
|
+
"papermill": {
|
|
87
|
+
"duration": 1.835817,
|
|
88
|
+
"end_time": "2026-02-02T13:03:23.137563",
|
|
89
|
+
"exception": false,
|
|
90
|
+
"start_time": "2026-02-02T13:03:21.301746",
|
|
91
|
+
"status": "completed"
|
|
92
|
+
},
|
|
93
|
+
"tags": []
|
|
94
|
+
},
|
|
95
|
+
"outputs": [],
|
|
96
|
+
"source": [
|
|
97
|
+
"from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
|
|
98
|
+
"track_and_export_previous(\"05_multi_dataset.ipynb\")\n",
|
|
99
|
+
"\n",
|
|
100
|
+
"from customer_retention.analysis.auto_explorer import (\n",
|
|
101
|
+
" ExplorationManager,\n",
|
|
102
|
+
" MultiDatasetFindings,\n",
|
|
103
|
+
" ExplorationFindings,\n",
|
|
104
|
+
" RecommendationRegistry,\n",
|
|
105
|
+
")\n",
|
|
106
|
+
"from customer_retention.stages.profiling import (\n",
|
|
107
|
+
" RelationshipDetector,\n",
|
|
108
|
+
" TimeWindowAggregator,\n",
|
|
109
|
+
" RelationshipType,\n",
|
|
110
|
+
" SegmentAnalyzer,\n",
|
|
111
|
+
" SegmentationMethod,\n",
|
|
112
|
+
" FeatureCapacityAnalyzer,\n",
|
|
113
|
+
" TemporalFeatureEngineer,\n",
|
|
114
|
+
" TemporalAggregationConfig,\n",
|
|
115
|
+
" ReferenceMode,\n",
|
|
116
|
+
" FeatureGroup,\n",
|
|
117
|
+
" DimensionReductionMethod,\n",
|
|
118
|
+
")\n",
|
|
119
|
+
"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
|
|
120
|
+
"from customer_retention.core.config.column_config import DatasetGranularity, ColumnType\n",
|
|
121
|
+
"from pathlib import Path\n",
|
|
122
|
+
"import yaml\n",
|
|
123
|
+
"import pandas as pd\n",
|
|
124
|
+
"import numpy as np\n",
|
|
125
|
+
"import plotly.graph_objects as go\n",
|
|
126
|
+
"from plotly.subplots import make_subplots\n",
|
|
127
|
+
"from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure\n"
|
|
128
|
+
]
|
|
129
|
+
},
|
|
130
|
+
{
|
|
131
|
+
"cell_type": "code",
|
|
132
|
+
"execution_count": null,
|
|
133
|
+
"id": "eda144a0",
|
|
134
|
+
"metadata": {
|
|
135
|
+
"execution": {
|
|
136
|
+
"iopub.execute_input": "2026-02-02T13:03:23.143847Z",
|
|
137
|
+
"iopub.status.busy": "2026-02-02T13:03:23.143614Z",
|
|
138
|
+
"iopub.status.idle": "2026-02-02T13:03:23.149654Z",
|
|
139
|
+
"shell.execute_reply": "2026-02-02T13:03:23.148371Z"
|
|
140
|
+
},
|
|
141
|
+
"papermill": {
|
|
142
|
+
"duration": 0.010775,
|
|
143
|
+
"end_time": "2026-02-02T13:03:23.150723",
|
|
144
|
+
"exception": false,
|
|
145
|
+
"start_time": "2026-02-02T13:03:23.139948",
|
|
146
|
+
"status": "completed"
|
|
147
|
+
},
|
|
148
|
+
"tags": []
|
|
149
|
+
},
|
|
150
|
+
"outputs": [],
|
|
151
|
+
"source": [
|
|
152
|
+
"# === CONFIGURATION ===\n",
|
|
153
|
+
"# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
|
|
154
|
+
"\n",
|
|
155
|
+
"# Initialize the exploration manager\n",
|
|
156
|
+
"manager = ExplorationManager(explorations_dir=FINDINGS_DIR)\n",
|
|
157
|
+
"\n",
|
|
158
|
+
"# Discover all explored datasets (prefers aggregated over event-level when both exist)\n",
|
|
159
|
+
"findings_files = manager.discover_findings(prefer_aggregated=True)\n",
|
|
160
|
+
"skipped_files = manager.get_skipped_event_findings()\n",
|
|
161
|
+
"\n",
|
|
162
|
+
"print(f\"Found {len(findings_files)} explored dataset(s):\\n\")\n",
|
|
163
|
+
"for f in findings_files:\n",
|
|
164
|
+
" print(f\" - {f.name}\")\n",
|
|
165
|
+
"\n",
|
|
166
|
+
"if skipped_files:\n",
|
|
167
|
+
" print(f\"\\n⏭️ Skipped {len(skipped_files)} event-level findings (using aggregated versions instead):\")\n",
|
|
168
|
+
" for f in skipped_files:\n",
|
|
169
|
+
" print(f\" - {f.name}\")\n",
|
|
170
|
+
"\n",
|
|
171
|
+
"# Load or initialize recommendations registry\n",
|
|
172
|
+
"RECOMMENDATIONS_PATH = FINDINGS_DIR / \"recommendations.yaml\"\n",
|
|
173
|
+
"if RECOMMENDATIONS_PATH.exists():\n",
|
|
174
|
+
" with open(RECOMMENDATIONS_PATH, \"r\") as f:\n",
|
|
175
|
+
" registry = RecommendationRegistry.from_dict(yaml.safe_load(f))\n",
|
|
176
|
+
" print(f\"\\nLoaded existing recommendations: {len(registry.all_recommendations)} total\")\n",
|
|
177
|
+
"else:\n",
|
|
178
|
+
" registry = RecommendationRegistry()\n",
|
|
179
|
+
" print(\"\\nInitialized new recommendation registry\")"
|
|
180
|
+
]
|
|
181
|
+
},
|
|
182
|
+
{
|
|
183
|
+
"cell_type": "code",
|
|
184
|
+
"execution_count": null,
|
|
185
|
+
"id": "a4d325cd",
|
|
186
|
+
"metadata": {
|
|
187
|
+
"execution": {
|
|
188
|
+
"iopub.execute_input": "2026-02-02T13:03:23.173572Z",
|
|
189
|
+
"iopub.status.busy": "2026-02-02T13:03:23.173134Z",
|
|
190
|
+
"iopub.status.idle": "2026-02-02T13:03:23.392956Z",
|
|
191
|
+
"shell.execute_reply": "2026-02-02T13:03:23.392264Z"
|
|
192
|
+
},
|
|
193
|
+
"papermill": {
|
|
194
|
+
"duration": 0.230614,
|
|
195
|
+
"end_time": "2026-02-02T13:03:23.393602",
|
|
196
|
+
"exception": false,
|
|
197
|
+
"start_time": "2026-02-02T13:03:23.162988",
|
|
198
|
+
"status": "completed"
|
|
199
|
+
},
|
|
200
|
+
"tags": []
|
|
201
|
+
},
|
|
202
|
+
"outputs": [],
|
|
203
|
+
"source": [
|
|
204
|
+
"# List datasets with details\n",
|
|
205
|
+
"datasets = manager.list_datasets()\n",
|
|
206
|
+
"\n",
|
|
207
|
+
"print(\"\\n\" + \"=\"*70)\n",
|
|
208
|
+
"print(\"DISCOVERED DATASETS\")\n",
|
|
209
|
+
"print(\"=\"*70 + \"\\n\")\n",
|
|
210
|
+
"\n",
|
|
211
|
+
"for ds in datasets:\n",
|
|
212
|
+
" granularity_emoji = \"\\U0001f4ca\" if ds.granularity == DatasetGranularity.ENTITY_LEVEL else \"\\U0001f4c8\"\n",
|
|
213
|
+
" target_info = f\" [TARGET: {ds.target_column}]\" if ds.target_column else \"\"\n",
|
|
214
|
+
" \n",
|
|
215
|
+
" print(f\"{granularity_emoji} {ds.name}\")\n",
|
|
216
|
+
" print(f\" Granularity: {ds.granularity.value}\")\n",
|
|
217
|
+
" print(f\" Rows: {ds.row_count:,} | Columns: {ds.column_count}\")\n",
|
|
218
|
+
" if ds.entity_column:\n",
|
|
219
|
+
" print(f\" Entity: {ds.entity_column} | Time: {ds.time_column}\")\n",
|
|
220
|
+
" print(f\" Source: {ds.source_path}{target_info}\")\n",
|
|
221
|
+
" print()"
|
|
222
|
+
]
|
|
223
|
+
},
|
|
224
|
+
{
|
|
225
|
+
"cell_type": "markdown",
|
|
226
|
+
"id": "23f1c0fd",
|
|
227
|
+
"metadata": {
|
|
228
|
+
"papermill": {
|
|
229
|
+
"duration": 0.002275,
|
|
230
|
+
"end_time": "2026-02-02T13:03:23.398411",
|
|
231
|
+
"exception": false,
|
|
232
|
+
"start_time": "2026-02-02T13:03:23.396136",
|
|
233
|
+
"status": "completed"
|
|
234
|
+
},
|
|
235
|
+
"tags": []
|
|
236
|
+
},
|
|
237
|
+
"source": [
|
|
238
|
+
"## 5.2 Multi-Dataset Dashboard"
|
|
239
|
+
]
|
|
240
|
+
},
|
|
241
|
+
{
|
|
242
|
+
"cell_type": "code",
|
|
243
|
+
"execution_count": null,
|
|
244
|
+
"id": "465a97e7",
|
|
245
|
+
"metadata": {
|
|
246
|
+
"execution": {
|
|
247
|
+
"iopub.execute_input": "2026-02-02T13:03:23.403897Z",
|
|
248
|
+
"iopub.status.busy": "2026-02-02T13:03:23.403766Z",
|
|
249
|
+
"iopub.status.idle": "2026-02-02T13:03:23.601533Z",
|
|
250
|
+
"shell.execute_reply": "2026-02-02T13:03:23.600794Z"
|
|
251
|
+
},
|
|
252
|
+
"papermill": {
|
|
253
|
+
"duration": 0.201877,
|
|
254
|
+
"end_time": "2026-02-02T13:03:23.602549",
|
|
255
|
+
"exception": false,
|
|
256
|
+
"start_time": "2026-02-02T13:03:23.400672",
|
|
257
|
+
"status": "completed"
|
|
258
|
+
},
|
|
259
|
+
"tags": []
|
|
260
|
+
},
|
|
261
|
+
"outputs": [],
|
|
262
|
+
"source": [
|
|
263
|
+
"# Create multi-dataset findings and visual dashboard\n",
|
|
264
|
+
"multi = manager.create_multi_dataset_findings()\n",
|
|
265
|
+
"\n",
|
|
266
|
+
"if len(datasets) > 0:\n",
|
|
267
|
+
" # Prepare data for visualization\n",
|
|
268
|
+
" names = [ds.name for ds in datasets]\n",
|
|
269
|
+
" rows = [ds.row_count for ds in datasets]\n",
|
|
270
|
+
" cols = [ds.column_count for ds in datasets]\n",
|
|
271
|
+
" granularities = [\"Entity\" if ds.granularity == DatasetGranularity.ENTITY_LEVEL else \"Event\" \n",
|
|
272
|
+
" for ds in datasets]\n",
|
|
273
|
+
" colors = [\"#2ecc71\" if ds.granularity == DatasetGranularity.ENTITY_LEVEL else \"#3498db\"\n",
|
|
274
|
+
" for ds in datasets]\n",
|
|
275
|
+
"\n",
|
|
276
|
+
" # Create dashboard with metrics column + horizontal bar charts\n",
|
|
277
|
+
" fig = make_subplots(\n",
|
|
278
|
+
" rows=2, cols=2,\n",
|
|
279
|
+
" column_widths=[0.35, 0.65],\n",
|
|
280
|
+
" row_heights=[0.5, 0.5],\n",
|
|
281
|
+
" specs=[[{\"type\": \"xy\", \"rowspan\": 2}, {\"type\": \"bar\"}],\n",
|
|
282
|
+
" [None, {\"type\": \"bar\"}]],\n",
|
|
283
|
+
" subplot_titles=(\"\", \"Row Counts by Dataset\", \"Column Counts by Dataset\"),\n",
|
|
284
|
+
" horizontal_spacing=0.12,\n",
|
|
285
|
+
" vertical_spacing=0.15\n",
|
|
286
|
+
" )\n",
|
|
287
|
+
"\n",
|
|
288
|
+
" # Left panel: invisible placeholder for metrics\n",
|
|
289
|
+
" fig.add_trace(go.Scatter(x=[0], y=[0], mode='markers', marker=dict(opacity=0), showlegend=False), row=1, col=1)\n",
|
|
290
|
+
"\n",
|
|
291
|
+
" # Top right: Horizontal bar chart for row counts (names readable on y-axis)\n",
|
|
292
|
+
" fig.add_trace(\n",
|
|
293
|
+
" go.Bar(y=names, x=rows, orientation='h', marker_color=colors, name=\"Rows\",\n",
|
|
294
|
+
" text=[f\"{r:,}\" for r in rows], textposition=\"auto\",\n",
|
|
295
|
+
" hovertemplate=\"%{y}: %{x:,} rows<extra></extra>\"),\n",
|
|
296
|
+
" row=1, col=2\n",
|
|
297
|
+
" )\n",
|
|
298
|
+
"\n",
|
|
299
|
+
" # Bottom right: Horizontal bar chart for column counts\n",
|
|
300
|
+
" fig.add_trace(\n",
|
|
301
|
+
" go.Bar(y=names, x=cols, orientation='h', marker_color=colors, name=\"Columns\",\n",
|
|
302
|
+
" text=cols, textposition=\"auto\",\n",
|
|
303
|
+
" hovertemplate=\"%{y}: %{x} columns<extra></extra>\"),\n",
|
|
304
|
+
" row=2, col=2\n",
|
|
305
|
+
" )\n",
|
|
306
|
+
"\n",
|
|
307
|
+
" # Build metrics text for left panel (expandable list format)\n",
|
|
308
|
+
" annotations = []\n",
|
|
309
|
+
" y_pos = 0.98\n",
|
|
310
|
+
"\n",
|
|
311
|
+
" # Total Datasets (label + value)\n",
|
|
312
|
+
" annotations.append(dict(x=0.01, y=y_pos, xref=\"paper\", yref=\"paper\",\n",
|
|
313
|
+
" text=\"<b>Total Datasets</b>\", showarrow=False, font=dict(size=11, color=\"#666\"), xanchor=\"left\"))\n",
|
|
314
|
+
" y_pos -= 0.06\n",
|
|
315
|
+
" annotations.append(dict(x=0.01, y=y_pos, xref=\"paper\", yref=\"paper\",\n",
|
|
316
|
+
" text=f\"<b>{len(multi.datasets)}</b>\", showarrow=False, font=dict(size=18, color=\"#2c3e50\"), xanchor=\"left\"))\n",
|
|
317
|
+
" y_pos -= 0.10\n",
|
|
318
|
+
"\n",
|
|
319
|
+
" # Primary Entity Dataset\n",
|
|
320
|
+
" primary_name = multi.primary_entity_dataset or \"Not detected\"\n",
|
|
321
|
+
" primary_color = \"#27ae60\" if multi.primary_entity_dataset else \"#e74c3c\"\n",
|
|
322
|
+
" annotations.append(dict(x=0.01, y=y_pos, xref=\"paper\", yref=\"paper\",\n",
|
|
323
|
+
" text=\"<b>Primary Entity</b>\", showarrow=False, font=dict(size=11, color=\"#666\"), xanchor=\"left\"))\n",
|
|
324
|
+
" y_pos -= 0.06\n",
|
|
325
|
+
" annotations.append(dict(x=0.01, y=y_pos, xref=\"paper\", yref=\"paper\",\n",
|
|
326
|
+
" text=f\"<span style='color:{primary_color}'>{primary_name}</span>\", \n",
|
|
327
|
+
" showarrow=False, font=dict(size=12), xanchor=\"left\"))\n",
|
|
328
|
+
" y_pos -= 0.10\n",
|
|
329
|
+
"\n",
|
|
330
|
+
" # Event Datasets (expandable list - one per row)\n",
|
|
331
|
+
" annotations.append(dict(x=0.01, y=y_pos, xref=\"paper\", yref=\"paper\",\n",
|
|
332
|
+
" text=\"<b>Event Datasets</b>\", showarrow=False, font=dict(size=11, color=\"#666\"), xanchor=\"left\"))\n",
|
|
333
|
+
" y_pos -= 0.06\n",
|
|
334
|
+
" \n",
|
|
335
|
+
" if multi.event_datasets:\n",
|
|
336
|
+
"        # Show each event dataset on its own line; lists longer than max_display are summarized\n",
|
|
337
|
+
" max_display = min(len(multi.event_datasets), 8) # Show up to 8, then summarize\n",
|
|
338
|
+
" for i, event_name in enumerate(multi.event_datasets[:max_display]):\n",
|
|
339
|
+
" annotations.append(dict(x=0.03, y=y_pos, xref=\"paper\", yref=\"paper\",\n",
|
|
340
|
+
" text=f\"• {event_name}\", showarrow=False, font=dict(size=10, color=\"#3498db\"), xanchor=\"left\"))\n",
|
|
341
|
+
" y_pos -= 0.045\n",
|
|
342
|
+
" \n",
|
|
343
|
+
" if len(multi.event_datasets) > max_display:\n",
|
|
344
|
+
" remaining = len(multi.event_datasets) - max_display\n",
|
|
345
|
+
" annotations.append(dict(x=0.03, y=y_pos, xref=\"paper\", yref=\"paper\",\n",
|
|
346
|
+
" text=f\"... +{remaining} more\", showarrow=False, font=dict(size=10, color=\"#888\"), xanchor=\"left\"))\n",
|
|
347
|
+
" y_pos -= 0.045\n",
|
|
348
|
+
" else:\n",
|
|
349
|
+
" annotations.append(dict(x=0.03, y=y_pos, xref=\"paper\", yref=\"paper\",\n",
|
|
350
|
+
" text=\"None\", showarrow=False, font=dict(size=10, color=\"#888\"), xanchor=\"left\"))\n",
|
|
351
|
+
" y_pos -= 0.045\n",
|
|
352
|
+
"\n",
|
|
353
|
+
" # Aggregation Windows at bottom\n",
|
|
354
|
+
" y_pos = max(y_pos - 0.05, 0.02)\n",
|
|
355
|
+
" windows_str = \", \".join(multi.aggregation_windows[:4])\n",
|
|
356
|
+
" if len(multi.aggregation_windows) > 4:\n",
|
|
357
|
+
" windows_str += \"...\"\n",
|
|
358
|
+
" annotations.append(dict(x=0.01, y=y_pos, xref=\"paper\", yref=\"paper\",\n",
|
|
359
|
+
" text=f\"<b>Windows:</b> {windows_str}\", showarrow=False, font=dict(size=9, color=\"#888\"), xanchor=\"left\"))\n",
|
|
360
|
+
"\n",
|
|
361
|
+
" fig.update_layout(\n",
|
|
362
|
+
" title=\"Multi-Dataset Overview\",\n",
|
|
363
|
+
" height=500,\n",
|
|
364
|
+
" showlegend=False,\n",
|
|
365
|
+
" template=\"plotly_white\",\n",
|
|
366
|
+
" annotations=annotations\n",
|
|
367
|
+
" )\n",
|
|
368
|
+
"\n",
|
|
369
|
+
" # Hide axes on left panel\n",
|
|
370
|
+
" fig.update_xaxes(visible=False, row=1, col=1)\n",
|
|
371
|
+
" fig.update_yaxes(visible=False, row=1, col=1)\n",
|
|
372
|
+
" \n",
|
|
373
|
+
" # Configure horizontal bar axes\n",
|
|
374
|
+
" fig.update_yaxes(categoryorder='total ascending', row=1, col=2)\n",
|
|
375
|
+
" fig.update_yaxes(categoryorder='total ascending', row=2, col=2)\n",
|
|
376
|
+
"\n",
|
|
377
|
+
" display_figure(fig)\n",
|
|
378
|
+
"\n",
|
|
379
|
+
" # Legend for colors\n",
|
|
380
|
+
" print(\"Legend: 🟢 Entity-level (one row per entity) 🔵 Event-level (multiple rows per entity)\")\n",
|
|
381
|
+
"else:\n",
|
|
382
|
+
" print(\"No datasets found. Run notebooks 01a-01d first to explore your data.\")"
|
|
383
|
+
]
|
|
384
|
+
},
|
|
385
|
+
{
|
|
386
|
+
"cell_type": "markdown",
|
|
387
|
+
"id": "60022f5a",
|
|
388
|
+
"metadata": {
|
|
389
|
+
"papermill": {
|
|
390
|
+
"duration": 0.004026,
|
|
391
|
+
"end_time": "2026-02-02T13:03:23.611009",
|
|
392
|
+
"exception": false,
|
|
393
|
+
"start_time": "2026-02-02T13:03:23.606983",
|
|
394
|
+
"status": "completed"
|
|
395
|
+
},
|
|
396
|
+
"tags": []
|
|
397
|
+
},
|
|
398
|
+
"source": [
|
|
399
|
+
"## 5.3 Dataset Selection (Optional Override)\n",
|
|
400
|
+
"\n",
|
|
401
|
+
"By default, all discovered datasets are included. To analyze only specific datasets, provide their names below."
|
|
402
|
+
]
|
|
403
|
+
},
|
|
404
|
+
{
|
|
405
|
+
"cell_type": "code",
|
|
406
|
+
"execution_count": null,
|
|
407
|
+
"id": "68d408f6",
|
|
408
|
+
"metadata": {
|
|
409
|
+
"execution": {
|
|
410
|
+
"iopub.execute_input": "2026-02-02T13:03:23.619976Z",
|
|
411
|
+
"iopub.status.busy": "2026-02-02T13:03:23.619857Z",
|
|
412
|
+
"iopub.status.idle": "2026-02-02T13:03:23.622891Z",
|
|
413
|
+
"shell.execute_reply": "2026-02-02T13:03:23.622459Z"
|
|
414
|
+
},
|
|
415
|
+
"papermill": {
|
|
416
|
+
"duration": 0.008084,
|
|
417
|
+
"end_time": "2026-02-02T13:03:23.623501",
|
|
418
|
+
"exception": false,
|
|
419
|
+
"start_time": "2026-02-02T13:03:23.615417",
|
|
420
|
+
"status": "completed"
|
|
421
|
+
},
|
|
422
|
+
"tags": []
|
|
423
|
+
},
|
|
424
|
+
"outputs": [],
|
|
425
|
+
"source": [
|
|
426
|
+
"# === DATASET SELECTION (Optional) ===\n",
|
|
427
|
+
"# Set to None to use all discovered datasets (default)\n",
|
|
428
|
+
"# Or provide a list of dataset names to include only those\n",
|
|
429
|
+
"DATASET_NAMES = ['customer_retention_retail'] # e.g., [\"customers\", \"transactions\", \"emails\"]\n",
|
|
430
|
+
"\n",
|
|
431
|
+
"if DATASET_NAMES:\n",
|
|
432
|
+
" # Filter to only specified datasets\n",
|
|
433
|
+
" available_names = [ds.name for ds in datasets]\n",
|
|
434
|
+
" valid_names = [name for name in DATASET_NAMES if name in available_names]\n",
|
|
435
|
+
" invalid_names = [name for name in DATASET_NAMES if name not in available_names]\n",
|
|
436
|
+
"\n",
|
|
437
|
+
" if invalid_names:\n",
|
|
438
|
+
" print(f\"⚠️ Datasets not found: {invalid_names}\")\n",
|
|
439
|
+
" print(f\" Available: {available_names}\")\n",
|
|
440
|
+
"\n",
|
|
441
|
+
" if valid_names:\n",
|
|
442
|
+
" # Recreate multi-dataset findings with only selected datasets\n",
|
|
443
|
+
" multi = manager.create_multi_dataset_findings(dataset_names=valid_names)\n",
|
|
444
|
+
" print(f\"✓ Using {len(valid_names)} selected dataset(s): {valid_names}\")\n",
|
|
445
|
+
" else:\n",
|
|
446
|
+
" print(\"⚠️ No valid datasets specified. Using all discovered datasets.\")\n",
|
|
447
|
+
"else:\n",
|
|
448
|
+
" print(f\"Using all {len(datasets)} discovered dataset(s)\")"
|
|
449
|
+
]
|
|
450
|
+
},
|
|
451
|
+
{
|
|
452
|
+
"cell_type": "markdown",
|
|
453
|
+
"id": "9af4a83a",
|
|
454
|
+
"metadata": {
|
|
455
|
+
"papermill": {
|
|
456
|
+
"duration": 0.005501,
|
|
457
|
+
"end_time": "2026-02-02T13:03:23.633770",
|
|
458
|
+
"exception": false,
|
|
459
|
+
"start_time": "2026-02-02T13:03:23.628269",
|
|
460
|
+
"status": "completed"
|
|
461
|
+
},
|
|
462
|
+
"tags": []
|
|
463
|
+
},
|
|
464
|
+
"source": [
|
|
465
|
+
"## 5.4 Define Relationships Between Datasets\n",
|
|
466
|
+
"\n",
|
|
467
|
+
"Relationships define how datasets connect. For each event dataset, specify:\n",
|
|
468
|
+
"- Which entity dataset it relates to\n",
|
|
469
|
+
"- Which columns form the join key\n",
|
|
470
|
+
"- The relationship type (one-to-many for event data)"
|
|
471
|
+
]
|
|
472
|
+
},
|
|
473
|
+
{
|
|
474
|
+
"cell_type": "code",
|
|
475
|
+
"execution_count": null,
|
|
476
|
+
"id": "231f9f79",
|
|
477
|
+
"metadata": {
|
|
478
|
+
"execution": {
|
|
479
|
+
"iopub.execute_input": "2026-02-02T13:03:23.643403Z",
|
|
480
|
+
"iopub.status.busy": "2026-02-02T13:03:23.643275Z",
|
|
481
|
+
"iopub.status.idle": "2026-02-02T13:03:23.646721Z",
|
|
482
|
+
"shell.execute_reply": "2026-02-02T13:03:23.646004Z"
|
|
483
|
+
},
|
|
484
|
+
"papermill": {
|
|
485
|
+
"duration": 0.009505,
|
|
486
|
+
"end_time": "2026-02-02T13:03:23.647552",
|
|
487
|
+
"exception": false,
|
|
488
|
+
"start_time": "2026-02-02T13:03:23.638047",
|
|
489
|
+
"status": "completed"
|
|
490
|
+
},
|
|
491
|
+
"tags": []
|
|
492
|
+
},
|
|
493
|
+
"outputs": [],
|
|
494
|
+
"source": [
|
|
495
|
+
"# List candidate joins between the primary entity dataset and each event dataset (from dataset metadata)\n",
|
|
496
|
+
"print(\"\\n\" + \"=\"*70)\n",
|
|
497
|
+
"print(\"RELATIONSHIP DETECTION\")\n",
|
|
498
|
+
"print(\"=\"*70 + \"\\n\")\n",
|
|
499
|
+
"\n",
|
|
500
|
+
"detector = RelationshipDetector()\n",
|
|
501
|
+
"\n",
|
|
502
|
+
"# If we have a primary entity dataset and event datasets, try to detect relationships\n",
|
|
503
|
+
"if multi.primary_entity_dataset and multi.event_datasets:\n",
|
|
504
|
+
" primary_info = multi.datasets[multi.primary_entity_dataset]\n",
|
|
505
|
+
" \n",
|
|
506
|
+
" print(f\"Primary dataset: {multi.primary_entity_dataset}\")\n",
|
|
507
|
+
" print(f\"Checking relationships with event datasets...\\n\")\n",
|
|
508
|
+
" \n",
|
|
509
|
+
" for event_name in multi.event_datasets:\n",
|
|
510
|
+
" event_info = multi.datasets[event_name]\n",
|
|
511
|
+
" \n",
|
|
512
|
+
"        # Report the event dataset's entity column, when set, as the potential join column\n",
|
|
513
|
+
" if event_info.entity_column:\n",
|
|
514
|
+
" print(f\"\\U0001f517 {multi.primary_entity_dataset} <-> {event_name}\")\n",
|
|
515
|
+
" print(f\" Potential join column: {event_info.entity_column}\")\n",
|
|
516
|
+
" print(f\" Expected relationship: one_to_many\")\n",
|
|
517
|
+
" print()\n",
|
|
518
|
+
"else:\n",
|
|
519
|
+
" print(\"Not enough datasets to detect relationships.\")\n",
|
|
520
|
+
" print(\"Need at least one entity-level and one event-level dataset.\")"
|
|
521
|
+
]
|
|
522
|
+
},
|
|
523
|
+
{
|
|
524
|
+
"cell_type": "code",
|
|
525
|
+
"execution_count": null,
|
|
526
|
+
"id": "f739bec7",
|
|
527
|
+
"metadata": {
|
|
528
|
+
"execution": {
|
|
529
|
+
"iopub.execute_input": "2026-02-02T13:03:23.656470Z",
|
|
530
|
+
"iopub.status.busy": "2026-02-02T13:03:23.656355Z",
|
|
531
|
+
"iopub.status.idle": "2026-02-02T13:03:23.659921Z",
|
|
532
|
+
"shell.execute_reply": "2026-02-02T13:03:23.659317Z"
|
|
533
|
+
},
|
|
534
|
+
"papermill": {
|
|
535
|
+
"duration": 0.009088,
|
|
536
|
+
"end_time": "2026-02-02T13:03:23.660820",
|
|
537
|
+
"exception": false,
|
|
538
|
+
"start_time": "2026-02-02T13:03:23.651732",
|
|
539
|
+
"status": "completed"
|
|
540
|
+
},
|
|
541
|
+
"tags": []
|
|
542
|
+
},
|
|
543
|
+
"outputs": [],
|
|
544
|
+
"source": [
|
|
545
|
+
"# === MANUAL RELATIONSHIP DEFINITION ===\n",
|
|
546
|
+
"# Define relationships between your datasets\n",
|
|
547
|
+
"# Uncomment and modify as needed\n",
|
|
548
|
+
"\n",
|
|
549
|
+
"# Example: Link transactions to customers\n",
|
|
550
|
+
"# multi.add_relationship(\n",
|
|
551
|
+
"# left_dataset=\"customers\",\n",
|
|
552
|
+
"# right_dataset=\"transactions\",\n",
|
|
553
|
+
"# left_column=\"customer_id\",\n",
|
|
554
|
+
"# right_column=\"customer_id\",\n",
|
|
555
|
+
"# relationship_type=\"one_to_many\"\n",
|
|
556
|
+
"# )\n",
|
|
557
|
+
"\n",
|
|
558
|
+
"# Example: Link emails to customers\n",
|
|
559
|
+
"# multi.add_relationship(\n",
|
|
560
|
+
"# left_dataset=\"customers\",\n",
|
|
561
|
+
"# right_dataset=\"emails\",\n",
|
|
562
|
+
"# left_column=\"customer_id\",\n",
|
|
563
|
+
"# right_column=\"customer_id\",\n",
|
|
564
|
+
"# relationship_type=\"one_to_many\"\n",
|
|
565
|
+
"# )\n",
|
|
566
|
+
"\n",
|
|
567
|
+
"print(f\"Defined relationships: {len(multi.relationships)}\")\n",
|
|
568
|
+
"for rel in multi.relationships:\n",
|
|
569
|
+
" print(f\" {rel.left_dataset}.{rel.left_column} -> {rel.right_dataset}.{rel.right_column} ({rel.relationship_type})\")\n",
|
|
570
|
+
"\n",
|
|
571
|
+
"# Initialize silver layer if not already done\n",
|
|
572
|
+
"if registry.silver is None:\n",
|
|
573
|
+
" entity_col = multi.datasets[multi.primary_entity_dataset].entity_column if multi.primary_entity_dataset else \"entity_id\"\n",
|
|
574
|
+
" registry.init_silver(entity_col)\n",
|
|
575
|
+
"\n",
|
|
576
|
+
"# Persist join recommendations to registry\n",
|
|
577
|
+
"for rel in multi.relationships:\n",
|
|
578
|
+
" registry.add_silver_join(\n",
|
|
579
|
+
" left_source=rel.left_dataset,\n",
|
|
580
|
+
" right_source=rel.right_dataset,\n",
|
|
581
|
+
" join_keys=[rel.left_column],\n",
|
|
582
|
+
" join_type=rel.relationship_type,\n",
|
|
583
|
+
" rationale=f\"Join {rel.left_dataset} with {rel.right_dataset} on {rel.left_column}\",\n",
|
|
584
|
+
" source_notebook=\"05_multi_dataset\"\n",
|
|
585
|
+
" )\n",
|
|
586
|
+
"\n",
|
|
587
|
+
"if multi.relationships:\n",
|
|
588
|
+
" print(f\"\\n✅ Persisted {len(multi.relationships)} join recommendations to registry\")"
|
|
589
|
+
]
|
|
590
|
+
},
|
|
591
|
+
{
|
|
592
|
+
"cell_type": "markdown",
|
|
593
|
+
"id": "ad203984",
|
|
594
|
+
"metadata": {
|
|
595
|
+
"papermill": {
|
|
596
|
+
"duration": 0.004387,
|
|
597
|
+
"end_time": "2026-02-02T13:03:23.671779",
|
|
598
|
+
"exception": false,
|
|
599
|
+
"start_time": "2026-02-02T13:03:23.667392",
|
|
600
|
+
"status": "completed"
|
|
601
|
+
},
|
|
602
|
+
"tags": []
|
|
603
|
+
},
|
|
604
|
+
"source": [
|
|
605
|
+
"## 5.5 Plan Temporal Feature Engineering\n",
|
|
606
|
+
"\n",
|
|
607
|
+
"For event datasets, we engineer sophisticated temporal features using **per-customer alignment**:\n",
|
|
608
|
+
"- Each customer's features are computed relative to their reference date (churn date or last activity)\n",
|
|
609
|
+
"- This makes historical churners comparable to current active customers\n",
|
|
610
|
+
"\n",
|
|
611
|
+
"**Feature Groups Available:**\n",
|
|
612
|
+
"\n",
|
|
613
|
+
"| Group | Features | Purpose |\n",
|
|
614
|
+
"|-------|----------|---------|\n",
|
|
615
|
+
"| **Lagged Windows** | `lag0_{metric}_{agg}`, `lag1_{metric}_{agg}`, ... | Sequential non-overlapping time windows |\n",
|
|
616
|
+
"| **Velocity** | `{metric}_velocity`, `{metric}_velocity_pct` | Rate of change between windows |\n",
|
|
617
|
+
"| **Acceleration** | `{metric}_acceleration`, `{metric}_momentum` | Change in velocity, weighted direction |\n",
|
|
618
|
+
"| **Lifecycle** | `{metric}_beginning`, `{metric}_middle`, `{metric}_end`, `{metric}_trend_ratio` | Beginning/middle/end of customer history |\n",
|
|
619
|
+
"| **Recency** | `days_since_last_event`, `days_since_first_event`, `active_span_days`, `recency_ratio` | How recently customer was active |\n",
|
|
620
|
+
"| **Regularity** | `event_frequency`, `inter_event_gap_mean`, `inter_event_gap_std`, `regularity_score` | Consistency of engagement |\n",
|
|
621
|
+
"| **Cohort Comparison** | `{metric}_vs_cohort_mean`, `{metric}_vs_cohort_pct`, `{metric}_cohort_zscore` | Customer vs peer group |"
|
|
622
|
+
]
|
|
623
|
+
},
|
|
624
|
+
{
|
|
625
|
+
"cell_type": "code",
|
|
626
|
+
"execution_count": null,
|
|
627
|
+
"id": "1d005aff",
|
|
628
|
+
"metadata": {
|
|
629
|
+
"execution": {
|
|
630
|
+
"iopub.execute_input": "2026-02-02T13:03:23.683740Z",
|
|
631
|
+
"iopub.status.busy": "2026-02-02T13:03:23.683600Z",
|
|
632
|
+
"iopub.status.idle": "2026-02-02T13:03:23.687775Z",
|
|
633
|
+
"shell.execute_reply": "2026-02-02T13:03:23.687218Z"
|
|
634
|
+
},
|
|
635
|
+
"papermill": {
|
|
636
|
+
"duration": 0.012099,
|
|
637
|
+
"end_time": "2026-02-02T13:03:23.688972",
|
|
638
|
+
"exception": false,
|
|
639
|
+
"start_time": "2026-02-02T13:03:23.676873",
|
|
640
|
+
"status": "completed"
|
|
641
|
+
},
|
|
642
|
+
"tags": []
|
|
643
|
+
},
|
|
644
|
+
"outputs": [],
|
|
645
|
+
"source": [
|
|
646
|
+
"# Get aggregation plan for event datasets\n",
|
|
647
|
+
"agg_plan = multi.get_aggregation_plan()\n",
|
|
648
|
+
"\n",
|
|
649
|
+
"print(\"\\n\" + \"=\"*70)\n",
|
|
650
|
+
"print(\"AGGREGATION PLAN\")\n",
|
|
651
|
+
"print(\"=\"*70 + \"\\n\")\n",
|
|
652
|
+
"\n",
|
|
653
|
+
"if agg_plan:\n",
|
|
654
|
+
" for dataset_name, plan in agg_plan.items():\n",
|
|
655
|
+
" print(f\"\\U0001f4ca {dataset_name}\")\n",
|
|
656
|
+
" print(f\" Entity column: {plan.entity_column}\")\n",
|
|
657
|
+
" print(f\" Time column: {plan.time_column}\")\n",
|
|
658
|
+
" print(f\" Windows: {plan.windows}\")\n",
|
|
659
|
+
" print(f\" Default agg funcs: {plan.agg_funcs}\")\n",
|
|
660
|
+
" print()\n",
|
|
661
|
+
"else:\n",
|
|
662
|
+
" print(\"No event datasets to aggregate.\")\n",
|
|
663
|
+
"\n",
|
|
664
|
+
"# Show available feature groups\n",
|
|
665
|
+
"print(\"\\n\" + \"=\"*70)\n",
|
|
666
|
+
"print(\"TEMPORAL FEATURE GROUPS\")\n",
|
|
667
|
+
"print(\"=\"*70 + \"\\n\")\n",
|
|
668
|
+
"\n",
|
|
669
|
+
"for group in FeatureGroup:\n",
|
|
670
|
+
" enabled = \"✓\" if group in [FeatureGroup.LAGGED_WINDOWS, FeatureGroup.VELOCITY, \n",
|
|
671
|
+
" FeatureGroup.RECENCY, FeatureGroup.REGULARITY] else \"○\"\n",
|
|
672
|
+
" print(f\" {enabled} {group.value}\")"
|
|
673
|
+
]
|
|
674
|
+
},
|
|
675
|
+
{
|
|
676
|
+
"cell_type": "code",
|
|
677
|
+
"execution_count": null,
|
|
678
|
+
"id": "89dee3cc",
|
|
679
|
+
"metadata": {
|
|
680
|
+
"execution": {
|
|
681
|
+
"iopub.execute_input": "2026-02-02T13:03:23.701821Z",
|
|
682
|
+
"iopub.status.busy": "2026-02-02T13:03:23.701603Z",
|
|
683
|
+
"iopub.status.idle": "2026-02-02T13:03:23.707747Z",
|
|
684
|
+
"shell.execute_reply": "2026-02-02T13:03:23.706719Z"
|
|
685
|
+
},
|
|
686
|
+
"papermill": {
|
|
687
|
+
"duration": 0.01362,
|
|
688
|
+
"end_time": "2026-02-02T13:03:23.708473",
|
|
689
|
+
"exception": false,
|
|
690
|
+
"start_time": "2026-02-02T13:03:23.694853",
|
|
691
|
+
"status": "completed"
|
|
692
|
+
},
|
|
693
|
+
"tags": []
|
|
694
|
+
},
|
|
695
|
+
"outputs": [],
|
|
696
|
+
"source": [
|
|
697
|
+
"# === CONFIGURE TEMPORAL FEATURES ===\n",
|
|
698
|
+
"\n",
|
|
699
|
+
"# Reference mode: PER_CUSTOMER aligns to each customer's reference date\n",
|
|
700
|
+
"# This is critical for churn models where customers churned at different times\n",
|
|
701
|
+
"REFERENCE_MODE = ReferenceMode.PER_CUSTOMER\n",
|
|
702
|
+
"\n",
|
|
703
|
+
"# Lagged window configuration\n",
|
|
704
|
+
"LAG_WINDOW_DAYS = 30 # Each lag window spans this many days\n",
|
|
705
|
+
"NUM_LAGS = 4 # Number of sequential windows (lag0, lag1, lag2, lag3)\n",
|
|
706
|
+
"LAG_AGGREGATIONS = [\"sum\", \"mean\", \"count\", \"max\"] # Aggregations per window\n",
|
|
707
|
+
"\n",
|
|
708
|
+
"# Feature groups to compute\n",
|
|
709
|
+
"FEATURE_GROUPS = [\n",
|
|
710
|
+
" FeatureGroup.LAGGED_WINDOWS, # lag0_amount_sum, lag1_amount_sum, ...\n",
|
|
711
|
+
" FeatureGroup.VELOCITY, # amount_velocity (rate of change)\n",
|
|
712
|
+
" FeatureGroup.ACCELERATION, # amount_acceleration, amount_momentum\n",
|
|
713
|
+
" FeatureGroup.LIFECYCLE, # amount_beginning, amount_middle, amount_end\n",
|
|
714
|
+
" FeatureGroup.RECENCY, # days_since_last_event, active_span_days\n",
|
|
715
|
+
" FeatureGroup.REGULARITY, # event_frequency, regularity_score\n",
|
|
716
|
+
" FeatureGroup.COHORT_COMPARISON, # amount_cohort_zscore\n",
|
|
717
|
+
"]\n",
|
|
718
|
+
"\n",
|
|
719
|
+
"# Lifecycle configuration\n",
|
|
720
|
+
"MIN_HISTORY_DAYS = 60 # Minimum days of history for lifecycle features (else NaN)\n",
|
|
721
|
+
"\n",
|
|
722
|
+
"# Create configuration\n",
|
|
723
|
+
"temporal_config = TemporalAggregationConfig(\n",
|
|
724
|
+
" reference_mode=REFERENCE_MODE,\n",
|
|
725
|
+
" lag_window_days=LAG_WINDOW_DAYS,\n",
|
|
726
|
+
" num_lags=NUM_LAGS,\n",
|
|
727
|
+
" lag_aggregations=LAG_AGGREGATIONS,\n",
|
|
728
|
+
" compute_velocity=FeatureGroup.VELOCITY in FEATURE_GROUPS,\n",
|
|
729
|
+
" compute_acceleration=FeatureGroup.ACCELERATION in FEATURE_GROUPS,\n",
|
|
730
|
+
" compute_lifecycle=FeatureGroup.LIFECYCLE in FEATURE_GROUPS,\n",
|
|
731
|
+
" min_history_days=MIN_HISTORY_DAYS,\n",
|
|
732
|
+
" compute_recency=FeatureGroup.RECENCY in FEATURE_GROUPS,\n",
|
|
733
|
+
" compute_regularity=FeatureGroup.REGULARITY in FEATURE_GROUPS,\n",
|
|
734
|
+
" compute_cohort=FeatureGroup.COHORT_COMPARISON in FEATURE_GROUPS,\n",
|
|
735
|
+
")\n",
|
|
736
|
+
"\n",
|
|
737
|
+
"# Store in multi-dataset findings\n",
|
|
738
|
+
"multi.notes['temporal_config'] = {\n",
|
|
739
|
+
" 'reference_mode': REFERENCE_MODE.value,\n",
|
|
740
|
+
" 'lag_window_days': LAG_WINDOW_DAYS,\n",
|
|
741
|
+
" 'num_lags': NUM_LAGS,\n",
|
|
742
|
+
" 'lag_aggregations': LAG_AGGREGATIONS,\n",
|
|
743
|
+
" 'feature_groups': [g.value for g in FEATURE_GROUPS],\n",
|
|
744
|
+
" 'min_history_days': MIN_HISTORY_DAYS,\n",
|
|
745
|
+
"}\n",
|
|
746
|
+
"\n",
|
|
747
|
+
"# Persist temporal configuration to registry for each event dataset\n",
|
|
748
|
+
"for dataset_name in multi.event_datasets:\n",
|
|
749
|
+
" ds_info = multi.datasets[dataset_name]\n",
|
|
750
|
+
" findings = manager.load_findings(dataset_name)\n",
|
|
751
|
+
" \n",
|
|
752
|
+
" if findings:\n",
|
|
753
|
+
" numeric_cols = [\n",
|
|
754
|
+
" name for name, col in findings.columns.items()\n",
|
|
755
|
+
" if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
|
|
756
|
+
" and name not in [ds_info.entity_column, ds_info.time_column] and name not in TEMPORAL_METADATA_COLS\n",
|
|
757
|
+
" ]\n",
|
|
758
|
+
" \n",
|
|
759
|
+
" if numeric_cols:\n",
|
|
760
|
+
" registry.add_silver_temporal_config(\n",
|
|
761
|
+
" source_dataset=dataset_name,\n",
|
|
762
|
+
" columns=numeric_cols,\n",
|
|
763
|
+
" lag_windows=NUM_LAGS,\n",
|
|
764
|
+
" lag_window_days=LAG_WINDOW_DAYS,\n",
|
|
765
|
+
" aggregations=LAG_AGGREGATIONS,\n",
|
|
766
|
+
" feature_groups=[g.value for g in FEATURE_GROUPS],\n",
|
|
767
|
+
" rationale=f\"Temporal features for {dataset_name} with {len(numeric_cols)} columns\",\n",
|
|
768
|
+
" source_notebook=\"05_multi_dataset\"\n",
|
|
769
|
+
" )\n",
|
|
770
|
+
"\n",
|
|
771
|
+
"print(\"Temporal Feature Configuration:\")\n",
|
|
772
|
+
"print(f\" Reference Mode: {REFERENCE_MODE.value}\")\n",
|
|
773
|
+
"print(f\" Lag Windows: {NUM_LAGS} x {LAG_WINDOW_DAYS} days\")\n",
|
|
774
|
+
"print(f\" Aggregations: {LAG_AGGREGATIONS}\")\n",
|
|
775
|
+
"print(f\" Feature Groups: {len(FEATURE_GROUPS)} enabled\")\n",
|
|
776
|
+
"print()\n",
|
|
777
|
+
"print(\"\\U0001f4a1 With per-customer alignment, all customers are measured from their reference point.\")\n",
|
|
778
|
+
"print(\" - Churned customers: reference = churn date\")\n",
|
|
779
|
+
"print(\" - Active customers: reference = last activity or analysis date\")\n",
|
|
780
|
+
"\n",
|
|
781
|
+
"if multi.event_datasets:\n",
|
|
782
|
+
" print(f\"\\n✅ Persisted temporal config for {len(multi.event_datasets)} event dataset(s)\")"
|
|
783
|
+
]
|
|
784
|
+
},
|
|
785
|
+
{
|
|
786
|
+
"cell_type": "markdown",
|
|
787
|
+
"id": "4c87b0ec",
|
|
788
|
+
"metadata": {
|
|
789
|
+
"papermill": {
|
|
790
|
+
"duration": 0.00386,
|
|
791
|
+
"end_time": "2026-02-02T13:03:23.716688",
|
|
792
|
+
"exception": false,
|
|
793
|
+
"start_time": "2026-02-02T13:03:23.712828",
|
|
794
|
+
"status": "completed"
|
|
795
|
+
},
|
|
796
|
+
"tags": []
|
|
797
|
+
},
|
|
798
|
+
"source": [
|
|
799
|
+
"## 5.6 Preview Feature Set\n",
|
|
800
|
+
"\n",
|
|
801
|
+
"Preview the features that will be created from time-window aggregations."
|
|
802
|
+
]
|
|
803
|
+
},
|
|
804
|
+
{
|
|
805
|
+
"cell_type": "code",
|
|
806
|
+
"execution_count": null,
|
|
807
|
+
"id": "aa9ce4ca",
|
|
808
|
+
"metadata": {
|
|
809
|
+
"execution": {
|
|
810
|
+
"iopub.execute_input": "2026-02-02T13:03:23.726130Z",
|
|
811
|
+
"iopub.status.busy": "2026-02-02T13:03:23.725975Z",
|
|
812
|
+
"iopub.status.idle": "2026-02-02T13:03:23.732128Z",
|
|
813
|
+
"shell.execute_reply": "2026-02-02T13:03:23.731518Z"
|
|
814
|
+
},
|
|
815
|
+
"papermill": {
|
|
816
|
+
"duration": 0.011605,
|
|
817
|
+
"end_time": "2026-02-02T13:03:23.732641",
|
|
818
|
+
"exception": false,
|
|
819
|
+
"start_time": "2026-02-02T13:03:23.721036",
|
|
820
|
+
"status": "completed"
|
|
821
|
+
},
|
|
822
|
+
"tags": []
|
|
823
|
+
},
|
|
824
|
+
"outputs": [],
|
|
825
|
+
"source": [
|
|
826
|
+
"# For each event dataset, preview what features could be created\n",
|
|
827
|
+
"print(\"\\n\" + \"=\"*70)\n",
|
|
828
|
+
"print(\"TEMPORAL FEATURES PREVIEW\")\n",
|
|
829
|
+
"print(\"=\"*70 + \"\\n\")\n",
|
|
830
|
+
"\n",
|
|
831
|
+
"for dataset_name in multi.event_datasets:\n",
|
|
832
|
+
" ds_info = multi.datasets[dataset_name]\n",
|
|
833
|
+
" \n",
|
|
834
|
+
" print(f\"\\U0001f4c8 From {dataset_name}:\")\n",
|
|
835
|
+
" print()\n",
|
|
836
|
+
" \n",
|
|
837
|
+
" # Load findings to see numeric columns\n",
|
|
838
|
+
" findings = manager.load_findings(dataset_name)\n",
|
|
839
|
+
" \n",
|
|
840
|
+
" # Find numeric columns that could be aggregated\n",
|
|
841
|
+
" numeric_cols = []\n",
|
|
842
|
+
" if findings:\n",
|
|
843
|
+
" numeric_cols = [\n",
|
|
844
|
+
" name for name, col in findings.columns.items()\n",
|
|
845
|
+
" if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
|
|
846
|
+
" and name not in [ds_info.entity_column, ds_info.time_column] and name not in TEMPORAL_METADATA_COLS\n",
|
|
847
|
+
" ]\n",
|
|
848
|
+
" \n",
|
|
849
|
+
" # Group 1: Lagged Window Features\n",
|
|
850
|
+
" if FeatureGroup.LAGGED_WINDOWS in FEATURE_GROUPS:\n",
|
|
851
|
+
" print(\" 📊 LAGGED WINDOWS (Group 1):\")\n",
|
|
852
|
+
" for col in numeric_cols[:2]:\n",
|
|
853
|
+
" features = [f\"lag{i}_{col}_{agg}\" for i in range(NUM_LAGS) for agg in LAG_AGGREGATIONS[:2]]\n",
|
|
854
|
+
" print(f\" {col}: {features[:4]}...\")\n",
|
|
855
|
+
" print(f\" Total: {len(numeric_cols)} cols × {NUM_LAGS} lags × {len(LAG_AGGREGATIONS)} aggs\")\n",
|
|
856
|
+
" \n",
|
|
857
|
+
" # Group 2: Velocity Features\n",
|
|
858
|
+
" if FeatureGroup.VELOCITY in FEATURE_GROUPS:\n",
|
|
859
|
+
" print(\"\\n 🚀 VELOCITY (Group 2):\")\n",
|
|
860
|
+
" for col in numeric_cols[:2]:\n",
|
|
861
|
+
" print(f\" - {col}_velocity, {col}_velocity_pct\")\n",
|
|
862
|
+
" print(f\" Total: {len(numeric_cols)} cols × 2 features\")\n",
|
|
863
|
+
" \n",
|
|
864
|
+
" # Group 3: Acceleration Features\n",
|
|
865
|
+
" if FeatureGroup.ACCELERATION in FEATURE_GROUPS:\n",
|
|
866
|
+
" print(\"\\n ⚡ ACCELERATION (Group 3):\")\n",
|
|
867
|
+
" for col in numeric_cols[:2]:\n",
|
|
868
|
+
" print(f\" - {col}_acceleration, {col}_momentum\")\n",
|
|
869
|
+
" print(f\" Total: {len(numeric_cols)} cols × 2 features\")\n",
|
|
870
|
+
" \n",
|
|
871
|
+
" # Group 4: Lifecycle Features\n",
|
|
872
|
+
" if FeatureGroup.LIFECYCLE in FEATURE_GROUPS:\n",
|
|
873
|
+
" print(\"\\n 📈 LIFECYCLE (Group 4):\")\n",
|
|
874
|
+
" for col in numeric_cols[:2]:\n",
|
|
875
|
+
" print(f\" - {col}_beginning, {col}_middle, {col}_end, {col}_trend_ratio\")\n",
|
|
876
|
+
" print(f\" Total: {len(numeric_cols)} cols × 4 features\")\n",
|
|
877
|
+
" print(f\" ℹ️ Requires {MIN_HISTORY_DAYS}+ days of history (else NaN)\")\n",
|
|
878
|
+
" \n",
|
|
879
|
+
" # Group 5: Recency Features\n",
|
|
880
|
+
" if FeatureGroup.RECENCY in FEATURE_GROUPS:\n",
|
|
881
|
+
" print(\"\\n ⏱️ RECENCY (Group 5):\")\n",
|
|
882
|
+
" print(\" - days_since_last_event\")\n",
|
|
883
|
+
" print(\" - days_since_first_event\")\n",
|
|
884
|
+
" print(\" - active_span_days\")\n",
|
|
885
|
+
" print(\" - recency_ratio\")\n",
|
|
886
|
+
" \n",
|
|
887
|
+
" # Group 6: Regularity Features\n",
|
|
888
|
+
" if FeatureGroup.REGULARITY in FEATURE_GROUPS:\n",
|
|
889
|
+
" print(\"\\n 🎯 REGULARITY (Group 6):\")\n",
|
|
890
|
+
" print(\" - event_frequency\")\n",
|
|
891
|
+
" print(\" - inter_event_gap_mean\")\n",
|
|
892
|
+
" print(\" - inter_event_gap_std\")\n",
|
|
893
|
+
" print(\" - regularity_score\")\n",
|
|
894
|
+
" \n",
|
|
895
|
+
" # Group 7: Cohort Comparison\n",
|
|
896
|
+
" if FeatureGroup.COHORT_COMPARISON in FEATURE_GROUPS:\n",
|
|
897
|
+
" print(\"\\n 👥 COHORT COMPARISON (Group 7):\")\n",
|
|
898
|
+
" for col in numeric_cols[:2]:\n",
|
|
899
|
+
" print(f\" - {col}_vs_cohort_mean, {col}_vs_cohort_pct, {col}_cohort_zscore\")\n",
|
|
900
|
+
" print(f\" Total: {len(numeric_cols)} cols × 3 features\")\n",
|
|
901
|
+
" \n",
|
|
902
|
+
" # Summary\n",
|
|
903
|
+
" total_features = 0\n",
|
|
904
|
+
" if FeatureGroup.LAGGED_WINDOWS in FEATURE_GROUPS:\n",
|
|
905
|
+
" total_features += len(numeric_cols) * NUM_LAGS * len(LAG_AGGREGATIONS)\n",
|
|
906
|
+
" if FeatureGroup.VELOCITY in FEATURE_GROUPS:\n",
|
|
907
|
+
" total_features += len(numeric_cols) * 2\n",
|
|
908
|
+
" if FeatureGroup.ACCELERATION in FEATURE_GROUPS:\n",
|
|
909
|
+
" total_features += len(numeric_cols) * 2\n",
|
|
910
|
+
" if FeatureGroup.LIFECYCLE in FEATURE_GROUPS:\n",
|
|
911
|
+
" total_features += len(numeric_cols) * 4\n",
|
|
912
|
+
" if FeatureGroup.RECENCY in FEATURE_GROUPS:\n",
|
|
913
|
+
" total_features += 4\n",
|
|
914
|
+
" if FeatureGroup.REGULARITY in FEATURE_GROUPS:\n",
|
|
915
|
+
" total_features += 4\n",
|
|
916
|
+
" if FeatureGroup.COHORT_COMPARISON in FEATURE_GROUPS:\n",
|
|
917
|
+
" total_features += len(numeric_cols) * 3\n",
|
|
918
|
+
" \n",
|
|
919
|
+
" print(f\"\\n 📝 TOTAL ESTIMATED FEATURES: ~{total_features}\")\n",
|
|
920
|
+
" print()"
|
|
921
|
+
]
|
|
922
|
+
},
|
|
923
|
+
{
|
|
924
|
+
"cell_type": "markdown",
|
|
925
|
+
"id": "1e8b9248",
|
|
926
|
+
"metadata": {
|
|
927
|
+
"papermill": {
|
|
928
|
+
"duration": 0.004047,
|
|
929
|
+
"end_time": "2026-02-02T13:03:23.741097",
|
|
930
|
+
"exception": false,
|
|
931
|
+
"start_time": "2026-02-02T13:03:23.737050",
|
|
932
|
+
"status": "completed"
|
|
933
|
+
},
|
|
934
|
+
"tags": []
|
|
935
|
+
},
|
|
936
|
+
"source": [
|
|
937
|
+
"## 5.7 Segmentation Analysis\n",
|
|
938
|
+
"\n",
|
|
939
|
+
"Should we build **separate models per customer segment** or a **single unified model**? This analysis provides evidence-based metrics to make that decision.\n",
|
|
940
|
+
"\n",
|
|
941
|
+
"**Key Decision Metrics:**\n",
|
|
942
|
+
"\n",
|
|
943
|
+
"| Metric | What It Measures | Good Value |\n",
|
|
944
|
+
"|--------|------------------|------------|\n",
|
|
945
|
+
"| **Silhouette Score** | Cluster cohesion (how tight) & separation (how different) | > 0.25 |\n",
|
|
946
|
+
"| **Target Variance** | How much target rates differ across segments | > 0.15 |\n",
|
|
947
|
+
"| **Segment Balance** | Size distribution across segments | > 0.3 ratio |\n",
|
|
948
|
+
"| **EPV per Segment** | Events-per-variable for reliable modeling | > 10 |\n",
|
|
949
|
+
"\n",
|
|
950
|
+
"**Interpretation Guide:**\n",
|
|
951
|
+
"- **Silhouette > 0.5**: Strong natural clustering - segments are distinct\n",
|
|
952
|
+
"- **Silhouette 0.25-0.5**: Reasonable structure - segments somewhat distinct \n",
|
|
953
|
+
"- **Silhouette 0-0.25**: Weak structure - data is relatively homogeneous\n",
|
|
954
|
+
"- **Silhouette < 0**: Overlapping clusters - segmentation not supported"
|
|
955
|
+
]
|
|
956
|
+
},
|
|
957
|
+
{
|
|
958
|
+
"cell_type": "code",
|
|
959
|
+
"execution_count": null,
|
|
960
|
+
"id": "2f826435",
|
|
961
|
+
"metadata": {
|
|
962
|
+
"execution": {
|
|
963
|
+
"iopub.execute_input": "2026-02-02T13:03:23.750855Z",
|
|
964
|
+
"iopub.status.busy": "2026-02-02T13:03:23.750732Z",
|
|
965
|
+
"iopub.status.idle": "2026-02-02T13:03:23.763154Z",
|
|
966
|
+
"shell.execute_reply": "2026-02-02T13:03:23.762680Z"
|
|
967
|
+
},
|
|
968
|
+
"papermill": {
|
|
969
|
+
"duration": 0.017748,
|
|
970
|
+
"end_time": "2026-02-02T13:03:23.763611",
|
|
971
|
+
"exception": false,
|
|
972
|
+
"start_time": "2026-02-02T13:03:23.745863",
|
|
973
|
+
"status": "completed"
|
|
974
|
+
},
|
|
975
|
+
"tags": []
|
|
976
|
+
},
|
|
977
|
+
"outputs": [],
|
|
978
|
+
"source": [
|
|
979
|
+
"# Segmentation Analysis on Primary Entity Dataset\n",
|
|
980
|
+
"from customer_retention.stages.temporal import load_data_with_snapshot_preference\n",
|
|
981
|
+
"\n",
|
|
982
|
+
"print(\"=\" * 70)\n",
|
|
983
|
+
"print(\"SEGMENTATION ANALYSIS\")\n",
|
|
984
|
+
"print(\"=\" * 70)\n",
|
|
985
|
+
"\n",
|
|
986
|
+
"segment_analyzer = SegmentAnalyzer()\n",
|
|
987
|
+
"capacity_analyzer = FeatureCapacityAnalyzer()\n",
|
|
988
|
+
"\n",
|
|
989
|
+
"# Consistent color palette: Segment 0=blue, 1=red, 2=green, 3=purple, etc.\n",
|
|
990
|
+
"SEGMENT_COLORS = {\n",
|
|
991
|
+
" 0: '#3498db', # Blue\n",
|
|
992
|
+
" 1: '#e74c3c', # Red\n",
|
|
993
|
+
" 2: '#2ecc71', # Green\n",
|
|
994
|
+
" 3: '#9b59b6', # Purple\n",
|
|
995
|
+
" 4: '#f39c12', # Orange\n",
|
|
996
|
+
" 5: '#1abc9c', # Teal\n",
|
|
997
|
+
" 6: '#e67e22', # Dark Orange\n",
|
|
998
|
+
"}\n",
|
|
999
|
+
"\n",
|
|
1000
|
+
"if multi.primary_entity_dataset:\n",
|
|
1001
|
+
" primary_info = multi.datasets[multi.primary_entity_dataset]\n",
|
|
1002
|
+
" primary_findings = manager.load_findings(multi.primary_entity_dataset)\n",
|
|
1003
|
+
" \n",
|
|
1004
|
+
" if primary_findings:\n",
|
|
1005
|
+
" # Load the primary dataset from snapshot (not source) to get correct column names\n",
|
|
1006
|
+
" primary_df, data_source = load_data_with_snapshot_preference(primary_findings, output_dir=str(FINDINGS_DIR))\n",
|
|
1007
|
+
" print(f\" Loaded from: {data_source}\")\n",
|
|
1008
|
+
" \n",
|
|
1009
|
+
" # Get numeric features for clustering (exclude temporal metadata)\n",
|
|
1010
|
+
" from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS\n",
|
|
1011
|
+
" numeric_features = [\n",
|
|
1012
|
+
" name for name, col in primary_findings.columns.items()\n",
|
|
1013
|
+
" if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
|
|
1014
|
+
" and name != primary_info.target_column\n",
|
|
1015
|
+
" and name not in TEMPORAL_METADATA_COLS\n",
|
|
1016
|
+
" ]\n",
|
|
1017
|
+
" \n",
|
|
1018
|
+
" print(f\"\\n📊 Dataset: {multi.primary_entity_dataset}\")\n",
|
|
1019
|
+
" print(f\" Total Samples: {len(primary_df):,}\")\n",
|
|
1020
|
+
" print(f\" Numeric Features: {len(numeric_features)}\")\n",
|
|
1021
|
+
" print(f\" Target Column: {primary_info.target_column}\")\n",
|
|
1022
|
+
" \n",
|
|
1023
|
+
" # Run full segmentation analysis using framework\n",
|
|
1024
|
+
" analysis = segment_analyzer.run_full_analysis(\n",
|
|
1025
|
+
" primary_df,\n",
|
|
1026
|
+
" feature_cols=numeric_features,\n",
|
|
1027
|
+
" target_col=primary_info.target_column,\n",
|
|
1028
|
+
" max_segments=5,\n",
|
|
1029
|
+
" dim_reduction=DimensionReductionMethod.PCA,\n",
|
|
1030
|
+
" )\n",
|
|
1031
|
+
" m = analysis.metrics # Shorthand for metrics\n",
|
|
1032
|
+
" \n",
|
|
1033
|
+
" # ============================================================\n",
|
|
1034
|
+
" # KEY DECISION METRICS\n",
|
|
1035
|
+
" # ============================================================\n",
|
|
1036
|
+
" print(\"\\n\" + \"=\" * 70)\n",
|
|
1037
|
+
" print(\"📊 CLUSTERING DECISION METRICS\")\n",
|
|
1038
|
+
" print(\"=\" * 70)\n",
|
|
1039
|
+
" \n",
|
|
1040
|
+
" print(f\"\"\"\n",
|
|
1041
|
+
"┌─────────────────────────────────────────────────────────────────────┐\n",
|
|
1042
|
+
"│ METRIC │ VALUE │ INTERPRETATION │\n",
|
|
1043
|
+
"├──────────────────────────────────┼─────────────┼────────────────────┤\n",
|
|
1044
|
+
"│ Silhouette Score (cohesion) │ {m.silhouette_score:+.3f} │ {m.silhouette_interpretation:<18} │\n",
|
|
1045
|
+
"│ Target Rate Variance │ {f'{m.target_variance_ratio:.3f}' if m.target_variance_ratio else 'N/A':>11} │ {m.target_variance_interpretation:<18} │\n",
|
|
1046
|
+
"│ Optimal Segments Found │ {m.n_segments} │ {m.segments_interpretation:<18} │\n",
|
|
1047
|
+
"│ Overall Confidence │ {m.confidence:.0%} │ {m.confidence_interpretation:<18} │\n",
|
|
1048
|
+
"└──────────────────────────────────┴─────────────┴────────────────────┘\"\"\")\n",
|
|
1049
|
+
" \n",
|
|
1050
|
+
" print(f\"\\n🎯 RECOMMENDATION: {m.recommendation.upper().replace('_', ' ')}\")\n",
|
|
1051
|
+
" print(f\"\\n📋 Supporting Evidence:\")\n",
|
|
1052
|
+
" for r in m.rationale:\n",
|
|
1053
|
+
" print(f\" • {r}\")\n",
|
|
1054
|
+
" \n",
|
|
1055
|
+
" # ============================================================\n",
|
|
1056
|
+
" # SEGMENT PROFILES\n",
|
|
1057
|
+
" # ============================================================\n",
|
|
1058
|
+
" print(\"\\n\" + \"=\" * 70)\n",
|
|
1059
|
+
" print(\"📊 SEGMENT PROFILES\")\n",
|
|
1060
|
+
" print(\"=\" * 70 + \"\\n\")\n",
|
|
1061
|
+
" \n",
|
|
1062
|
+
" segment_data = [{\n",
|
|
1063
|
+
" \"Segment\": f\"Segment {p.segment_id}\",\n",
|
|
1064
|
+
" \"N (count)\": f\"{p.size:,}\",\n",
|
|
1065
|
+
" \"% of Total\": f\"{p.size_pct:.1f}%\",\n",
|
|
1066
|
+
" \"Target Rate\": f\"{p.target_rate:.1%}\" if p.target_rate is not None else \"N/A\",\n",
|
|
1067
|
+
" \"Viable for ML\": \"✓\" if p.size >= 100 else \"⚠️\"\n",
|
|
1068
|
+
" } for p in analysis.profiles]\n",
|
|
1069
|
+
" display(pd.DataFrame(segment_data))\n",
|
|
1070
|
+
" \n",
|
|
1071
|
+
" sd = analysis.size_distribution\n",
|
|
1072
|
+
" print(f\"\\n📊 Size Distribution:\")\n",
|
|
1073
|
+
" print(f\" Total datapoints: {sd['total']:,}\")\n",
|
|
1074
|
+
" print(f\" Smallest segment: {sd['min_size']:,} ({sd['min_pct']:.1f}%)\")\n",
|
|
1075
|
+
" print(f\" Largest segment: {sd['max_size']:,} ({sd['max_pct']:.1f}%)\")\n",
|
|
1076
|
+
" print(f\" Balance ratio: {sd['balance_ratio']:.2f} (1.0 = perfectly balanced)\")\n",
|
|
1077
|
+
" \n",
|
|
1078
|
+
" # ============================================================\n",
|
|
1079
|
+
" # CLUSTER VISUALIZATION\n",
|
|
1080
|
+
" # ============================================================\n",
|
|
1081
|
+
" if analysis.has_visualization:\n",
|
|
1082
|
+
" viz = analysis.visualization\n",
|
|
1083
|
+
" seg_result = analysis.segmentation_result\n",
|
|
1084
|
+
" \n",
|
|
1085
|
+
" fig = make_subplots(\n",
|
|
1086
|
+
" rows=1, cols=3,\n",
|
|
1087
|
+
" subplot_titles=(\n",
|
|
1088
|
+
" f\"Cluster Visualization (PCA, {viz.explained_variance_ratio:.0%} var)\" \n",
|
|
1089
|
+
" if viz.explained_variance_ratio else \"Cluster Visualization (PCA)\",\n",
|
|
1090
|
+
" \"Segment Sizes\", \"Target Rate\"\n",
|
|
1091
|
+
" ),\n",
|
|
1092
|
+
" horizontal_spacing=0.12,\n",
|
|
1093
|
+
" column_widths=[0.4, 0.3, 0.3]\n",
|
|
1094
|
+
" )\n",
|
|
1095
|
+
" \n",
|
|
1096
|
+
" unique_labels = sorted(set(seg_result.labels[seg_result.labels >= 0]))\n",
|
|
1097
|
+
" \n",
|
|
1098
|
+
" # Scatter plot - consistent colors by segment ID\n",
|
|
1099
|
+
" for label in unique_labels:\n",
|
|
1100
|
+
" mask = seg_result.labels == label\n",
|
|
1101
|
+
" color = SEGMENT_COLORS.get(label, '#888888')\n",
|
|
1102
|
+
" profile = next((p for p in analysis.profiles if p.segment_id == label), None)\n",
|
|
1103
|
+
" name = f\"Seg {label} (n={profile.size:,})\" if profile else f\"Seg {label}\"\n",
|
|
1104
|
+
" fig.add_trace(go.Scatter(\n",
|
|
1105
|
+
" x=viz.x[mask], y=viz.y[mask], mode='markers',\n",
|
|
1106
|
+
" marker=dict(color=color, size=6, opacity=0.6),\n",
|
|
1107
|
+
" name=name, hovertemplate=f\"{name}<br>PC1: %{{x:.2f}}<br>PC2: %{{y:.2f}}<extra></extra>\"\n",
|
|
1108
|
+
" ), row=1, col=1)\n",
|
|
1109
|
+
" \n",
|
|
1110
|
+
" # Short labels for bar charts (avoid overlap)\n",
|
|
1111
|
+
" bar_labels = [f\"Seg {p.segment_id}\" for p in analysis.profiles]\n",
|
|
1112
|
+
" sizes = [p.size for p in analysis.profiles]\n",
|
|
1113
|
+
" bar_colors = [SEGMENT_COLORS.get(p.segment_id, '#888888') for p in analysis.profiles]\n",
|
|
1114
|
+
" \n",
|
|
1115
|
+
" # Size bars - numbers inside\n",
|
|
1116
|
+
" fig.add_trace(go.Bar(\n",
|
|
1117
|
+
" y=bar_labels, x=sizes, orientation='h',\n",
|
|
1118
|
+
" marker_color=bar_colors,\n",
|
|
1119
|
+
" text=[f\"{s:,}\" for s in sizes],\n",
|
|
1120
|
+
" textposition='inside', textfont=dict(color='white'),\n",
|
|
1121
|
+
" showlegend=False,\n",
|
|
1122
|
+
" hovertemplate=\"Segment %{y}<br>Count: %{x:,}<extra></extra>\"\n",
|
|
1123
|
+
" ), row=1, col=2)\n",
|
|
1124
|
+
" \n",
|
|
1125
|
+
" # Target rates - consistent segment colors, numbers inside\n",
|
|
1126
|
+
" if all(p.target_rate is not None for p in analysis.profiles):\n",
|
|
1127
|
+
" rates = [p.target_rate * 100 for p in analysis.profiles]\n",
|
|
1128
|
+
" fig.add_trace(go.Bar(\n",
|
|
1129
|
+
" y=bar_labels, x=rates, orientation='h',\n",
|
|
1130
|
+
" marker_color=bar_colors, # Same colors as size chart\n",
|
|
1131
|
+
" text=[f\"{r:.1f}%\" for r in rates],\n",
|
|
1132
|
+
" textposition='inside', textfont=dict(color='white'),\n",
|
|
1133
|
+
" showlegend=False,\n",
|
|
1134
|
+
" hovertemplate=\"Segment %{y}<br>Target: %{x:.1f}%<extra></extra>\"\n",
|
|
1135
|
+
" ), row=1, col=3)\n",
|
|
1136
|
+
" overall = sum(p.target_rate * p.size for p in analysis.profiles) / sd['total'] * 100\n",
|
|
1137
|
+
" fig.add_vline(x=overall, line_dash=\"dash\", line_color=\"#2c3e50\",\n",
|
|
1138
|
+
" annotation_text=f\"Avg: {overall:.1f}%\", annotation_position=\"top\", row=1, col=3)\n",
|
|
1139
|
+
" \n",
|
|
1140
|
+
" fig.update_layout(\n",
|
|
1141
|
+
" title=\"Segment Analysis Overview\", \n",
|
|
1142
|
+
" height=400, \n",
|
|
1143
|
+
" template=\"plotly_white\",\n",
|
|
1144
|
+
" legend=dict(\n",
|
|
1145
|
+
" orientation=\"h\", \n",
|
|
1146
|
+
" yanchor=\"top\", \n",
|
|
1147
|
+
" y=-0.15,\n",
|
|
1148
|
+
" xanchor=\"center\", \n",
|
|
1149
|
+
" x=0.5\n",
|
|
1150
|
+
" ),\n",
|
|
1151
|
+
" margin=dict(r=20, b=80)\n",
|
|
1152
|
+
" )\n",
|
|
1153
|
+
" fig.update_xaxes(title_text=\"PC1\", row=1, col=1)\n",
|
|
1154
|
+
" fig.update_yaxes(title_text=\"PC2\", row=1, col=1)\n",
|
|
1155
|
+
" display_figure(fig)\n",
|
|
1156
|
+
" \n",
|
|
1157
|
+
" print(f\"\\n📈 CLUSTER VISUALIZATION:\")\n",
|
|
1158
|
+
" print(f\" Method: PCA | Variance Explained: {viz.explained_variance_ratio:.1%}\" if viz.explained_variance_ratio else \" Method: PCA\")\n",
|
|
1159
|
+
" print(f\" Colors: Seg 0=Blue, Seg 1=Red, Seg 2=Green, Seg 3=Purple\")\n",
|
|
1160
|
+
" \n",
|
|
1161
|
+
" # ============================================================\n",
|
|
1162
|
+
" # EPV CAPACITY ANALYSIS\n",
|
|
1163
|
+
" # ============================================================\n",
|
|
1164
|
+
" if m.n_segments > 1 and primary_info.target_column:\n",
|
|
1165
|
+
" print(\"\\n\" + \"=\" * 70)\n",
|
|
1166
|
+
" print(\"💡 SEGMENT CAPACITY ANALYSIS (EPV Check)\")\n",
|
|
1167
|
+
" print(\"=\" * 70)\n",
|
|
1168
|
+
" \n",
|
|
1169
|
+
" primary_df['_segment'] = analysis.segmentation_result.labels\n",
|
|
1170
|
+
" capacity = capacity_analyzer.analyze_segment_capacity(\n",
|
|
1171
|
+
" primary_df[primary_df['_segment'] >= 0],\n",
|
|
1172
|
+
" feature_cols=numeric_features,\n",
|
|
1173
|
+
" target_col=primary_info.target_column,\n",
|
|
1174
|
+
" segment_col='_segment'\n",
|
|
1175
|
+
" )\n",
|
|
1176
|
+
" primary_df.drop('_segment', axis=1, inplace=True)\n",
|
|
1177
|
+
" \n",
|
|
1178
|
+
" print(f\"\\n🎯 Strategy: {capacity.recommended_strategy.upper()}\")\n",
|
|
1179
|
+
" print(f\" Reason: {capacity.strategy_reason}\")\n",
|
|
1180
|
+
" if capacity.viable_segments:\n",
|
|
1181
|
+
" print(f\"\\n ✅ Viable segments: {capacity.viable_segments}\")\n",
|
|
1182
|
+
" if capacity.insufficient_segments:\n",
|
|
1183
|
+
" print(f\" ⚠️ Insufficient segments: {capacity.insufficient_segments}\")\n",
|
|
1184
|
+
" \n",
|
|
1185
|
+
" # Store in findings\n",
|
|
1186
|
+
" multi.notes.update({\n",
|
|
1187
|
+
" 'segmentation_recommendation': m.recommendation,\n",
|
|
1188
|
+
" 'segmentation_confidence': m.confidence,\n",
|
|
1189
|
+
" 'segmentation_silhouette': m.silhouette_score,\n",
|
|
1190
|
+
" 'segment_count': m.n_segments,\n",
|
|
1191
|
+
" 'segment_strategy': capacity.recommended_strategy,\n",
|
|
1192
|
+
" 'segment_sizes': {f\"segment_{p.segment_id}\": p.size for p in analysis.profiles}\n",
|
|
1193
|
+
" })\n",
|
|
1194
|
+
" \n",
|
|
1195
|
+
" # Initialize bronze layer if not already done\n",
|
|
1196
|
+
" if registry.bronze is None:\n",
|
|
1197
|
+
" registry.init_bronze(primary_info.source_path)\n",
|
|
1198
|
+
" \n",
|
|
1199
|
+
" # Persist segmentation strategy to registry\n",
|
|
1200
|
+
" registry.add_bronze_segmentation_strategy(\n",
|
|
1201
|
+
" strategy=m.recommendation,\n",
|
|
1202
|
+
" confidence=m.confidence,\n",
|
|
1203
|
+
" n_segments=m.n_segments,\n",
|
|
1204
|
+
" silhouette_score=m.silhouette_score,\n",
|
|
1205
|
+
" rationale=\"; \".join(m.rationale[:3]),\n",
|
|
1206
|
+
" source_notebook=\"05_multi_dataset\"\n",
|
|
1207
|
+
" )\n",
|
|
1208
|
+
" print(f\"\\n✅ Persisted segmentation strategy to registry: {m.recommendation}\")\n",
|
|
1209
|
+
" \n",
|
|
1210
|
+
" # ============================================================\n",
|
|
1211
|
+
" # DECISION SUMMARY\n",
|
|
1212
|
+
" # ============================================================\n",
|
|
1213
|
+
" print(\"\\n\" + \"=\" * 70)\n",
|
|
1214
|
+
" print(\"📋 SEGMENTATION DECISION SUMMARY\")\n",
|
|
1215
|
+
" print(\"=\" * 70)\n",
|
|
1216
|
+
" print(f\"\\n{analysis.get_decision_summary()}\")\n",
|
|
1217
|
+
"\n",
|
|
1218
|
+
"else:\n",
|
|
1219
|
+
" print(\"\\n⚠️ No primary entity dataset detected.\")"
|
|
1220
|
+
]
|
|
1221
|
+
},
|
|
1222
|
+
{
|
|
1223
|
+
"cell_type": "markdown",
|
|
1224
|
+
"id": "f3809b84",
|
|
1225
|
+
"metadata": {
|
|
1226
|
+
"papermill": {
|
|
1227
|
+
"duration": 0.004145,
|
|
1228
|
+
"end_time": "2026-02-02T13:03:23.771952",
|
|
1229
|
+
"exception": false,
|
|
1230
|
+
"start_time": "2026-02-02T13:03:23.767807",
|
|
1231
|
+
"status": "completed"
|
|
1232
|
+
},
|
|
1233
|
+
"tags": []
|
|
1234
|
+
},
|
|
1235
|
+
"source": [
|
|
1236
|
+
"## 5.8 Relationship Diagram"
|
|
1237
|
+
]
|
|
1238
|
+
},
|
|
1239
|
+
{
|
|
1240
|
+
"cell_type": "code",
|
|
1241
|
+
"execution_count": null,
|
|
1242
|
+
"id": "14dda2db",
|
|
1243
|
+
"metadata": {
|
|
1244
|
+
"execution": {
|
|
1245
|
+
"iopub.execute_input": "2026-02-02T13:03:23.780518Z",
|
|
1246
|
+
"iopub.status.busy": "2026-02-02T13:03:23.780420Z",
|
|
1247
|
+
"iopub.status.idle": "2026-02-02T13:03:23.783657Z",
|
|
1248
|
+
"shell.execute_reply": "2026-02-02T13:03:23.783152Z"
|
|
1249
|
+
},
|
|
1250
|
+
"papermill": {
|
|
1251
|
+
"duration": 0.008447,
|
|
1252
|
+
"end_time": "2026-02-02T13:03:23.784142",
|
|
1253
|
+
"exception": false,
|
|
1254
|
+
"start_time": "2026-02-02T13:03:23.775695",
|
|
1255
|
+
"status": "completed"
|
|
1256
|
+
},
|
|
1257
|
+
"tags": []
|
|
1258
|
+
},
|
|
1259
|
+
"outputs": [],
|
|
1260
|
+
"source": [
|
|
1261
|
+
"# Create a simple relationship diagram\n",
|
|
1262
|
+
"if len(multi.datasets) > 1:\n",
|
|
1263
|
+
" print(\"\\n\" + \"=\"*70)\n",
|
|
1264
|
+
" print(\"DATASET RELATIONSHIP DIAGRAM\")\n",
|
|
1265
|
+
" print(\"=\"*70 + \"\\n\")\n",
|
|
1266
|
+
" \n",
|
|
1267
|
+
" # ASCII diagram\n",
|
|
1268
|
+
" if multi.primary_entity_dataset:\n",
|
|
1269
|
+
" primary = multi.primary_entity_dataset\n",
|
|
1270
|
+
" primary_info = multi.datasets[primary]\n",
|
|
1271
|
+
" \n",
|
|
1272
|
+
" print(f\" +{'='*30}+\")\n",
|
|
1273
|
+
" print(f\" | {primary:^26} | <- PRIMARY (has target)\")\n",
|
|
1274
|
+
" print(f\" | {primary_info.row_count:,} rows{' '*15} |\")\n",
|
|
1275
|
+
" if primary_info.target_column:\n",
|
|
1276
|
+
" print(f\" | Target: {primary_info.target_column:<17} |\")\n",
|
|
1277
|
+
" print(f\" +{'='*30}+\")\n",
|
|
1278
|
+
" \n",
|
|
1279
|
+
" for event_name in multi.event_datasets:\n",
|
|
1280
|
+
" event_info = multi.datasets[event_name]\n",
|
|
1281
|
+
" join_col = event_info.entity_column or \"?\"\n",
|
|
1282
|
+
" \n",
|
|
1283
|
+
" print(f\" |\")\n",
|
|
1284
|
+
" print(f\" | {join_col}\")\n",
|
|
1285
|
+
" print(f\" v\")\n",
|
|
1286
|
+
" print(f\" +{'-'*30}+\")\n",
|
|
1287
|
+
" print(f\" | {event_name:^26} | <- EVENT LEVEL\")\n",
|
|
1288
|
+
" print(f\" | {event_info.row_count:,} rows{' '*15} |\")\n",
|
|
1289
|
+
" print(f\" | Time: {event_info.time_column or '?':<19} |\")\n",
|
|
1290
|
+
" print(f\" +{'-'*30}+\")\n",
|
|
1291
|
+
"else:\n",
|
|
1292
|
+
" print(\"Single dataset - no relationships to diagram.\")"
|
|
1293
|
+
]
|
|
1294
|
+
},
|
|
1295
|
+
{
|
|
1296
|
+
"cell_type": "markdown",
|
|
1297
|
+
"id": "7d599131",
|
|
1298
|
+
"metadata": {
|
|
1299
|
+
"papermill": {
|
|
1300
|
+
"duration": 0.003691,
|
|
1301
|
+
"end_time": "2026-02-02T13:03:23.792016",
|
|
1302
|
+
"exception": false,
|
|
1303
|
+
"start_time": "2026-02-02T13:03:23.788325",
|
|
1304
|
+
"status": "completed"
|
|
1305
|
+
},
|
|
1306
|
+
"tags": []
|
|
1307
|
+
},
|
|
1308
|
+
"source": [
|
|
1309
|
+
"## 5.9 Save Multi-Dataset Findings"
|
|
1310
|
+
]
|
|
1311
|
+
},
|
|
1312
|
+
{
|
|
1313
|
+
"cell_type": "code",
|
|
1314
|
+
"execution_count": null,
|
|
1315
|
+
"id": "cfbb90c4",
|
|
1316
|
+
"metadata": {
|
|
1317
|
+
"execution": {
|
|
1318
|
+
"iopub.execute_input": "2026-02-02T13:03:23.801511Z",
|
|
1319
|
+
"iopub.status.busy": "2026-02-02T13:03:23.801385Z",
|
|
1320
|
+
"iopub.status.idle": "2026-02-02T13:03:23.805961Z",
|
|
1321
|
+
"shell.execute_reply": "2026-02-02T13:03:23.805444Z"
|
|
1322
|
+
},
|
|
1323
|
+
"papermill": {
|
|
1324
|
+
"duration": 0.009851,
|
|
1325
|
+
"end_time": "2026-02-02T13:03:23.806470",
|
|
1326
|
+
"exception": false,
|
|
1327
|
+
"start_time": "2026-02-02T13:03:23.796619",
|
|
1328
|
+
"status": "completed"
|
|
1329
|
+
},
|
|
1330
|
+
"tags": []
|
|
1331
|
+
},
|
|
1332
|
+
"outputs": [],
|
|
1333
|
+
"source": [
|
|
1334
|
+
"# Save the multi-dataset findings and recommendations registry\n",
|
|
1335
|
+
"MULTI_FINDINGS_PATH = FINDINGS_DIR / \"multi_dataset_findings.yaml\"\n",
|
|
1336
|
+
"\n",
|
|
1337
|
+
"multi.save(str(MULTI_FINDINGS_PATH))\n",
|
|
1338
|
+
"\n",
|
|
1339
|
+
"# Save the recommendations registry\n",
|
|
1340
|
+
"registry.save(RECOMMENDATIONS_PATH)\n",
|
|
1341
|
+
"\n",
|
|
1342
|
+
"print(f\"\\n✅ Multi-dataset findings saved to: {MULTI_FINDINGS_PATH}\")\n",
|
|
1343
|
+
"print(f\"\\n Contents:\")\n",
|
|
1344
|
+
"print(f\" - {len(multi.datasets)} datasets\")\n",
|
|
1345
|
+
"print(f\" - {len(multi.relationships)} relationships\")\n",
|
|
1346
|
+
"print(f\" - {len(multi.event_datasets)} event datasets to aggregate\")\n",
|
|
1347
|
+
"print(f\" - Aggregation windows: {multi.aggregation_windows}\")\n",
|
|
1348
|
+
"\n",
|
|
1349
|
+
"print(f\"\\n✅ Recommendations registry saved: {RECOMMENDATIONS_PATH}\")\n",
|
|
1350
|
+
"print(f\" Total recommendations: {len(registry.all_recommendations)}\")\n"
|
|
1351
|
+
]
|
|
1352
|
+
},
|
|
1353
|
+
{
|
|
1354
|
+
"cell_type": "markdown",
|
|
1355
|
+
"id": "7b895a1b",
|
|
1356
|
+
"metadata": {
|
|
1357
|
+
"papermill": {
|
|
1358
|
+
"duration": 0.005196,
|
|
1359
|
+
"end_time": "2026-02-02T13:03:23.816347",
|
|
1360
|
+
"exception": false,
|
|
1361
|
+
"start_time": "2026-02-02T13:03:23.811151",
|
|
1362
|
+
"status": "completed"
|
|
1363
|
+
},
|
|
1364
|
+
"tags": []
|
|
1365
|
+
},
|
|
1366
|
+
"source": [
|
|
1367
|
+
"---\n",
|
|
1368
|
+
"\n",
|
|
1369
|
+
"## Summary: What We Learned\n",
|
|
1370
|
+
"\n",
|
|
1371
|
+
"In this notebook, we:\n",
|
|
1372
|
+
"\n",
|
|
1373
|
+
"1. **Discovered Datasets** - Found all exploration findings from previous notebooks\n",
|
|
1374
|
+
"2. **Visualized Overview** - Dashboard showing dataset sizes and structure\n",
|
|
1375
|
+
"3. **Selected Datasets** - Optionally filtered to specific datasets\n",
|
|
1376
|
+
"4. **Defined Relationships** - Established how datasets connect via keys\n",
|
|
1377
|
+
"5. **Planned Temporal Features** - Configured 7 feature groups (lagged windows, velocity, acceleration, lifecycle, recency, regularity, cohort comparison)\n",
|
|
1378
|
+
"6. **Previewed Features** - Saw what features will be created\n",
|
|
1379
|
+
"7. **Analyzed Segmentation** - Determined if segmented modeling is justified\n",
|
|
1380
|
+
"8. **Saved Configuration** - Saved multi-dataset findings for feature engineering\n",
|
|
1381
|
+
"\n",
|
|
1382
|
+
"## Key Decisions Made\n",
|
|
1383
|
+
"\n",
|
|
1384
|
+
"| Decision | Value | Rationale |\n",
|
|
1385
|
+
"|----------|-------|-----------|\n",
|
|
1386
|
+
"| Primary Entity Dataset | From `multi.primary_entity_dataset` | Has target column |\n",
|
|
1387
|
+
"| Event Datasets | From `multi.event_datasets` | Event-level data to aggregate |\n",
|
|
1388
|
+
"| Reference Mode | Per-customer alignment | Makes historical churners comparable to active customers |\n",
|
|
1389
|
+
"| Lag Windows | `NUM_LAGS` x `LAG_WINDOW_DAYS` days | Captures recent vs historical patterns |\n",
|
|
1390
|
+
"| Feature Groups | Lagged, Velocity, Acceleration, Lifecycle, Recency, Regularity, Cohort | Comprehensive temporal characterization |\n",
|
|
1391
|
+
"| Segmentation Strategy | From `multi.notes['segment_strategy']` | Based on EPV analysis |\n",
|
|
1392
|
+
"\n",
|
|
1393
|
+
"---\n",
|
|
1394
|
+
"\n",
|
|
1395
|
+
"## Next Steps\n",
|
|
1396
|
+
"\n",
|
|
1397
|
+
"Continue to **06_feature_opportunities.ipynb** to:\n",
|
|
1398
|
+
"- Deep dive into feature engineering opportunities\n",
|
|
1399
|
+
"- Analyze feature capacity constraints\n",
|
|
1400
|
+
"- Create derived features\n",
|
|
1401
|
+
"\n",
|
|
1402
|
+
"**Important:** The multi-dataset findings file (`multi_dataset_findings.yaml`) includes temporal feature configuration for use in subsequent notebooks."
|
|
1403
|
+
]
|
|
1404
|
+
},
|
|
1405
|
+
{
|
|
1406
|
+
"cell_type": "markdown",
|
|
1407
|
+
"id": "b37da519",
|
|
1408
|
+
"metadata": {
|
|
1409
|
+
"papermill": {
|
|
1410
|
+
"duration": 0.004561,
|
|
1411
|
+
"end_time": "2026-02-02T13:03:23.841911",
|
|
1412
|
+
"exception": false,
|
|
1413
|
+
"start_time": "2026-02-02T13:03:23.837350",
|
|
1414
|
+
"status": "completed"
|
|
1415
|
+
},
|
|
1416
|
+
"tags": []
|
|
1417
|
+
},
|
|
1418
|
+
"source": [
|
|
1419
|
+
"> **Save Reminder:** Save this notebook (Ctrl+S / Cmd+S) before running the next one.\n",
|
|
1420
|
+
"> The next notebook will automatically export this notebook's HTML documentation from the saved file."
|
|
1421
|
+
]
|
|
1422
|
+
}
|
|
1423
|
+
],
|
|
1424
|
+
"metadata": {
|
|
1425
|
+
"kernelspec": {
|
|
1426
|
+
"display_name": "Python 3",
|
|
1427
|
+
"language": "python",
|
|
1428
|
+
"name": "python3"
|
|
1429
|
+
},
|
|
1430
|
+
"language_info": {
|
|
1431
|
+
"codemirror_mode": {
|
|
1432
|
+
"name": "ipython",
|
|
1433
|
+
"version": 3
|
|
1434
|
+
},
|
|
1435
|
+
"file_extension": ".py",
|
|
1436
|
+
"mimetype": "text/x-python",
|
|
1437
|
+
"name": "python",
|
|
1438
|
+
"nbconvert_exporter": "python",
|
|
1439
|
+
"pygments_lexer": "ipython3",
|
|
1440
|
+
"version": "3.12.4"
|
|
1441
|
+
},
|
|
1442
|
+
"papermill": {
|
|
1443
|
+
"default_parameters": {},
|
|
1444
|
+
"duration": 5.931034,
|
|
1445
|
+
"end_time": "2026-02-02T13:03:26.462964",
|
|
1446
|
+
"environment_variables": {},
|
|
1447
|
+
"exception": null,
|
|
1448
|
+
"input_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/05_multi_dataset.ipynb",
|
|
1449
|
+
"output_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/05_multi_dataset.ipynb",
|
|
1450
|
+
"parameters": {},
|
|
1451
|
+
"start_time": "2026-02-02T13:03:20.531930",
|
|
1452
|
+
"version": "2.6.0"
|
|
1453
|
+
}
|
|
1454
|
+
},
|
|
1455
|
+
"nbformat": 4,
|
|
1456
|
+
"nbformat_minor": 5
|
|
1457
|
+
}
|