churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
|
@@ -0,0 +1,1690 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"id": "b25b3276",
|
|
6
|
+
"metadata": {
|
|
7
|
+
"papermill": {
|
|
8
|
+
"duration": 0.004672,
|
|
9
|
+
"end_time": "2026-02-02T13:00:43.539546",
|
|
10
|
+
"exception": false,
|
|
11
|
+
"start_time": "2026-02-02T13:00:43.534874",
|
|
12
|
+
"status": "completed"
|
|
13
|
+
},
|
|
14
|
+
"tags": []
|
|
15
|
+
},
|
|
16
|
+
"source": [
|
|
17
|
+
"# Chapter 1a: Temporal Deep Dive (Event Bronze Track)\n",
|
|
18
|
+
"\n",
|
|
19
|
+
"**Purpose:** Analyze event-level (time series) datasets with a focus on temporal patterns, entity lifecycles, and event frequency distributions.\n",
|
|
20
|
+
"\n",
|
|
21
|
+
"**When to use this notebook:**\n",
|
|
22
|
+
"- Your dataset was detected as `EVENT_LEVEL` granularity in notebook 01\n",
|
|
23
|
+
"- You have multiple rows per entity (customer, user, etc.)\n",
|
|
24
|
+
"- Each row represents an event with a timestamp\n",
|
|
25
|
+
"\n",
|
|
26
|
+
"**What you'll learn:**\n",
|
|
27
|
+
"- How to profile entity lifecycles (first event, last event, duration)\n",
|
|
28
|
+
"- How to interpret event frequency distributions per entity\n",
|
|
29
|
+
"- Inter-event timing patterns and their implications\n",
|
|
30
|
+
"- Time series-specific feature engineering opportunities\n",
|
|
31
|
+
"\n",
|
|
32
|
+
"**Outputs:**\n",
|
|
33
|
+
"- Entity lifecycle visualizations\n",
|
|
34
|
+
"- Event frequency distribution analysis\n",
|
|
35
|
+
"- Inter-event timing statistics\n",
|
|
36
|
+
"- Updated exploration findings with time series metadata\n",
|
|
37
|
+
"\n",
|
|
38
|
+
"---\n",
|
|
39
|
+
"\n",
|
|
40
|
+
"## Understanding Time Series Profiling\n",
|
|
41
|
+
"\n",
|
|
42
|
+
"| Metric | Description | Why It Matters |\n",
|
|
43
|
+
"|--------|-------------|----------------|\n",
|
|
44
|
+
"| **Events per Entity** | Distribution of event counts | Identifies power users vs. one-time users |\n",
|
|
45
|
+
"| **Entity Lifecycle** | Duration from first to last event | Reveals customer tenure patterns |\n",
|
|
46
|
+
"| **Inter-event Time** | Time between consecutive events | Indicates engagement patterns |\n",
|
|
47
|
+
"| **Time Span** | Overall data period coverage | Helps plan time window aggregations |\n",
|
|
48
|
+
"\n",
|
|
49
|
+
"**Aggregation Windows (used in notebook 01d):**\n",
|
|
50
|
+
"- 24h: Very recent activity\n",
|
|
51
|
+
"- 7d: Weekly patterns\n",
|
|
52
|
+
"- 30d: Monthly patterns\n",
|
|
53
|
+
"- 90d: Quarterly trends\n",
|
|
54
|
+
"- 180d: Semi-annual patterns\n",
|
|
55
|
+
"- 365d: Annual patterns\n",
|
|
56
|
+
"- all-time: Historical totals"
|
|
57
|
+
]
|
|
58
|
+
},
|
|
59
|
+
{
|
|
60
|
+
"cell_type": "markdown",
|
|
61
|
+
"id": "b77d73c3",
|
|
62
|
+
"metadata": {
|
|
63
|
+
"papermill": {
|
|
64
|
+
"duration": 0.00563,
|
|
65
|
+
"end_time": "2026-02-02T13:00:43.560252",
|
|
66
|
+
"exception": false,
|
|
67
|
+
"start_time": "2026-02-02T13:00:43.554622",
|
|
68
|
+
"status": "completed"
|
|
69
|
+
},
|
|
70
|
+
"tags": []
|
|
71
|
+
},
|
|
72
|
+
"source": [
|
|
73
|
+
"## 1a.1 Load Previous Findings"
|
|
74
|
+
]
|
|
75
|
+
},
|
|
76
|
+
{
|
|
77
|
+
"cell_type": "code",
|
|
78
|
+
"execution_count": null,
|
|
79
|
+
"id": "f637550c",
|
|
80
|
+
"metadata": {
|
|
81
|
+
"execution": {
|
|
82
|
+
"iopub.execute_input": "2026-02-02T13:00:43.566424Z",
|
|
83
|
+
"iopub.status.busy": "2026-02-02T13:00:43.566300Z",
|
|
84
|
+
"iopub.status.idle": "2026-02-02T13:00:45.358347Z",
|
|
85
|
+
"shell.execute_reply": "2026-02-02T13:00:45.357703Z"
|
|
86
|
+
},
|
|
87
|
+
"papermill": {
|
|
88
|
+
"duration": 1.796066,
|
|
89
|
+
"end_time": "2026-02-02T13:00:45.359167",
|
|
90
|
+
"exception": false,
|
|
91
|
+
"start_time": "2026-02-02T13:00:43.563101",
|
|
92
|
+
"status": "completed"
|
|
93
|
+
},
|
|
94
|
+
"tags": []
|
|
95
|
+
},
|
|
96
|
+
"outputs": [],
|
|
97
|
+
"source": [
|
|
98
|
+
"from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
|
|
99
|
+
"track_and_export_previous(\"01a_temporal_deep_dive.ipynb\")\n",
|
|
100
|
+
"\n",
|
|
101
|
+
"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
|
|
102
|
+
"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
|
|
103
|
+
"from customer_retention.core.config.column_config import ColumnType, DatasetGranularity\n",
|
|
104
|
+
"from customer_retention.stages.profiling import (\n",
|
|
105
|
+
" TimeSeriesProfiler, TimeSeriesProfile,\n",
|
|
106
|
+
" TypeDetector,\n",
|
|
107
|
+
" DistributionAnalyzer, TransformationType,\n",
|
|
108
|
+
")\n",
|
|
109
|
+
"import pandas as pd\n",
|
|
110
|
+
"import numpy as np\n",
|
|
111
|
+
"import plotly.graph_objects as go\n",
|
|
112
|
+
"import plotly.express as px\n",
|
|
113
|
+
"from plotly.subplots import make_subplots\n",
|
|
114
|
+
"from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure\n"
|
|
115
|
+
]
|
|
116
|
+
},
|
|
117
|
+
{
|
|
118
|
+
"cell_type": "code",
|
|
119
|
+
"execution_count": null,
|
|
120
|
+
"id": "ec0ff3ea",
|
|
121
|
+
"metadata": {
|
|
122
|
+
"execution": {
|
|
123
|
+
"iopub.execute_input": "2026-02-02T13:00:45.365797Z",
|
|
124
|
+
"iopub.status.busy": "2026-02-02T13:00:45.365667Z",
|
|
125
|
+
"iopub.status.idle": "2026-02-02T13:00:45.391020Z",
|
|
126
|
+
"shell.execute_reply": "2026-02-02T13:00:45.390535Z"
|
|
127
|
+
},
|
|
128
|
+
"papermill": {
|
|
129
|
+
"duration": 0.02932,
|
|
130
|
+
"end_time": "2026-02-02T13:00:45.391503",
|
|
131
|
+
"exception": false,
|
|
132
|
+
"start_time": "2026-02-02T13:00:45.362183",
|
|
133
|
+
"status": "completed"
|
|
134
|
+
},
|
|
135
|
+
"tags": []
|
|
136
|
+
},
|
|
137
|
+
"outputs": [],
|
|
138
|
+
"source": [
|
|
139
|
+
"# === CONFIGURATION ===\n",
|
|
140
|
+
"# Option 1: Set the exact path from notebook 01 output\n",
|
|
141
|
+
"# FINDINGS_PATH = \"../experiments/findings/transactions_abc123_findings.yaml\"\n",
|
|
142
|
+
"\n",
|
|
143
|
+
"# Option 2: Auto-discover findings files\n",
|
|
144
|
+
"from pathlib import Path\n",
|
|
145
|
+
"\n",
|
|
146
|
+
"# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
|
|
147
|
+
"\n",
|
|
148
|
+
"# Find all findings files\n",
|
|
149
|
+
"findings_files = [f for f in FINDINGS_DIR.glob(\"*_findings.yaml\") if \"multi_dataset\" not in f.name]\n",
|
|
150
|
+
"if not findings_files:\n",
|
|
151
|
+
" raise FileNotFoundError(f\"No findings files found in {FINDINGS_DIR}. Run notebook 01 first.\")\n",
|
|
152
|
+
"\n",
|
|
153
|
+
"# Sort by modification time (most recent first)\n",
|
|
154
|
+
"findings_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)\n",
|
|
155
|
+
"FINDINGS_PATH = str(findings_files[0])\n",
|
|
156
|
+
"\n",
|
|
157
|
+
"print(f\"Found {len(findings_files)} findings file(s)\")\n",
|
|
158
|
+
"print(f\"Using: {FINDINGS_PATH}\")\n",
|
|
159
|
+
"if len(findings_files) > 1:\n",
|
|
160
|
+
" print(f\"Other available: {[str(f.name) for f in findings_files[1:3]]}\")\n",
|
|
161
|
+
"\n",
|
|
162
|
+
"findings = ExplorationFindings.load(FINDINGS_PATH)\n",
|
|
163
|
+
"print(f\"\\nLoaded findings for {findings.column_count} columns from {findings.source_path}\")"
|
|
164
|
+
]
|
|
165
|
+
},
|
|
166
|
+
{
|
|
167
|
+
"cell_type": "code",
|
|
168
|
+
"execution_count": null,
|
|
169
|
+
"id": "1cb51799",
|
|
170
|
+
"metadata": {
|
|
171
|
+
"execution": {
|
|
172
|
+
"iopub.execute_input": "2026-02-02T13:00:45.397630Z",
|
|
173
|
+
"iopub.status.busy": "2026-02-02T13:00:45.397531Z",
|
|
174
|
+
"iopub.status.idle": "2026-02-02T13:00:45.399987Z",
|
|
175
|
+
"shell.execute_reply": "2026-02-02T13:00:45.399515Z"
|
|
176
|
+
},
|
|
177
|
+
"papermill": {
|
|
178
|
+
"duration": 0.006267,
|
|
179
|
+
"end_time": "2026-02-02T13:00:45.400468",
|
|
180
|
+
"exception": false,
|
|
181
|
+
"start_time": "2026-02-02T13:00:45.394201",
|
|
182
|
+
"status": "completed"
|
|
183
|
+
},
|
|
184
|
+
"tags": []
|
|
185
|
+
},
|
|
186
|
+
"outputs": [],
|
|
187
|
+
"source": [
|
|
188
|
+
"# Verify this is a time series dataset\n",
|
|
189
|
+
"if findings.is_time_series:\n",
|
|
190
|
+
" ts_meta = findings.time_series_metadata\n",
|
|
191
|
+
" temporal_pattern = (ts_meta.temporal_pattern or \"unknown\").upper()\n",
|
|
192
|
+
" print(f\"\\u2705 Dataset confirmed as {temporal_pattern} (event-level)\")\n",
|
|
193
|
+
" print(f\" Entity column: {ts_meta.entity_column}\")\n",
|
|
194
|
+
" print(f\" Time column: {ts_meta.time_column}\")\n",
|
|
195
|
+
" print(f\" Avg events per entity: {ts_meta.avg_events_per_entity:.1f}\" if ts_meta.avg_events_per_entity else \"\")\n",
|
|
196
|
+
"else:\n",
|
|
197
|
+
" print(\"\\u26a0\\ufe0f This dataset was NOT detected as time series.\")\n",
|
|
198
|
+
" print(\" Consider using 02_column_deep_dive.ipynb instead.\")\n",
|
|
199
|
+
" print(\" Or manually specify entity and time columns below.\")"
|
|
200
|
+
]
|
|
201
|
+
},
|
|
202
|
+
{
|
|
203
|
+
"cell_type": "markdown",
|
|
204
|
+
"id": "da320551",
|
|
205
|
+
"metadata": {
|
|
206
|
+
"papermill": {
|
|
207
|
+
"duration": 0.002516,
|
|
208
|
+
"end_time": "2026-02-02T13:00:45.405745",
|
|
209
|
+
"exception": false,
|
|
210
|
+
"start_time": "2026-02-02T13:00:45.403229",
|
|
211
|
+
"status": "completed"
|
|
212
|
+
},
|
|
213
|
+
"tags": []
|
|
214
|
+
},
|
|
215
|
+
"source": [
|
|
216
|
+
"## 1a.2 Load Source Data & Configure Columns"
|
|
217
|
+
]
|
|
218
|
+
},
|
|
219
|
+
{
|
|
220
|
+
"cell_type": "code",
|
|
221
|
+
"execution_count": null,
|
|
222
|
+
"id": "bf93640f",
|
|
223
|
+
"metadata": {
|
|
224
|
+
"execution": {
|
|
225
|
+
"iopub.execute_input": "2026-02-02T13:00:45.411398Z",
|
|
226
|
+
"iopub.status.busy": "2026-02-02T13:00:45.411286Z",
|
|
227
|
+
"iopub.status.idle": "2026-02-02T13:00:46.049065Z",
|
|
228
|
+
"shell.execute_reply": "2026-02-02T13:00:46.048415Z"
|
|
229
|
+
},
|
|
230
|
+
"papermill": {
|
|
231
|
+
"duration": 0.641436,
|
|
232
|
+
"end_time": "2026-02-02T13:00:46.049611",
|
|
233
|
+
"exception": false,
|
|
234
|
+
"start_time": "2026-02-02T13:00:45.408175",
|
|
235
|
+
"status": "completed"
|
|
236
|
+
},
|
|
237
|
+
"tags": []
|
|
238
|
+
},
|
|
239
|
+
"outputs": [],
|
|
240
|
+
"source": [
|
|
241
|
+
"from customer_retention.stages.temporal import load_data_with_snapshot_preference, TEMPORAL_METADATA_COLS\n",
|
|
242
|
+
"\n",
|
|
243
|
+
"df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
|
|
244
|
+
"charts = ChartBuilder()\n",
|
|
245
|
+
"\n",
|
|
246
|
+
"print(f\"Loaded {len(df):,} rows x {len(df.columns)} columns\")\n",
|
|
247
|
+
"print(f\"Data source: {data_source}\")"
|
|
248
|
+
]
|
|
249
|
+
},
|
|
250
|
+
{
|
|
251
|
+
"cell_type": "code",
|
|
252
|
+
"execution_count": null,
|
|
253
|
+
"id": "096fd6ce",
|
|
254
|
+
"metadata": {
|
|
255
|
+
"execution": {
|
|
256
|
+
"iopub.execute_input": "2026-02-02T13:00:46.056341Z",
|
|
257
|
+
"iopub.status.busy": "2026-02-02T13:00:46.056051Z",
|
|
258
|
+
"iopub.status.idle": "2026-02-02T13:00:46.059051Z",
|
|
259
|
+
"shell.execute_reply": "2026-02-02T13:00:46.058443Z"
|
|
260
|
+
},
|
|
261
|
+
"papermill": {
|
|
262
|
+
"duration": 0.007123,
|
|
263
|
+
"end_time": "2026-02-02T13:00:46.059703",
|
|
264
|
+
"exception": false,
|
|
265
|
+
"start_time": "2026-02-02T13:00:46.052580",
|
|
266
|
+
"status": "completed"
|
|
267
|
+
},
|
|
268
|
+
"tags": []
|
|
269
|
+
},
|
|
270
|
+
"outputs": [],
|
|
271
|
+
"source": [
|
|
272
|
+
"# === COLUMN CONFIGURATION ===\n",
|
|
273
|
+
"# These will be auto-populated from findings if available\n",
|
|
274
|
+
"# Override manually if needed\n",
|
|
275
|
+
"\n",
|
|
276
|
+
"if findings.is_time_series and findings.time_series_metadata:\n",
|
|
277
|
+
" ENTITY_COLUMN = findings.time_series_metadata.entity_column\n",
|
|
278
|
+
" TIME_COLUMN = findings.time_series_metadata.time_column\n",
|
|
279
|
+
"else:\n",
|
|
280
|
+
" # Manual configuration - uncomment and set if auto-detection failed\n",
|
|
281
|
+
" # ENTITY_COLUMN = \"customer_id\"\n",
|
|
282
|
+
" # TIME_COLUMN = \"event_date\"\n",
|
|
283
|
+
" \n",
|
|
284
|
+
" # Try auto-detection\n",
|
|
285
|
+
" detector = TypeDetector()\n",
|
|
286
|
+
" granularity = detector.detect_granularity(df)\n",
|
|
287
|
+
" ENTITY_COLUMN = granularity.entity_column\n",
|
|
288
|
+
" TIME_COLUMN = granularity.time_column\n",
|
|
289
|
+
"\n",
|
|
290
|
+
"print(f\"Entity column: {ENTITY_COLUMN}\")\n",
|
|
291
|
+
"print(f\"Time column: {TIME_COLUMN}\")\n",
|
|
292
|
+
"\n",
|
|
293
|
+
"if not ENTITY_COLUMN or not TIME_COLUMN:\n",
|
|
294
|
+
" raise ValueError(\"Please set ENTITY_COLUMN and TIME_COLUMN manually above\")"
|
|
295
|
+
]
|
|
296
|
+
},
|
|
297
|
+
{
|
|
298
|
+
"cell_type": "markdown",
|
|
299
|
+
"id": "699bc20a",
|
|
300
|
+
"metadata": {
|
|
301
|
+
"papermill": {
|
|
302
|
+
"duration": 0.002487,
|
|
303
|
+
"end_time": "2026-02-02T13:00:46.064992",
|
|
304
|
+
"exception": false,
|
|
305
|
+
"start_time": "2026-02-02T13:00:46.062505",
|
|
306
|
+
"status": "completed"
|
|
307
|
+
},
|
|
308
|
+
"tags": []
|
|
309
|
+
},
|
|
310
|
+
"source": [
|
|
311
|
+
"## 1a.3 Time Series Profile Overview\n",
|
|
312
|
+
"\n",
|
|
313
|
+
"**What we analyze:**\n",
|
|
314
|
+
"- Total events and unique entities\n",
|
|
315
|
+
"- Time span coverage\n",
|
|
316
|
+
"- Events per entity distribution\n",
|
|
317
|
+
"- Entity lifecycle metrics"
|
|
318
|
+
]
|
|
319
|
+
},
|
|
320
|
+
{
|
|
321
|
+
"cell_type": "code",
|
|
322
|
+
"execution_count": null,
|
|
323
|
+
"id": "50d5d2c0",
|
|
324
|
+
"metadata": {
|
|
325
|
+
"execution": {
|
|
326
|
+
"iopub.execute_input": "2026-02-02T13:00:46.071065Z",
|
|
327
|
+
"iopub.status.busy": "2026-02-02T13:00:46.070953Z",
|
|
328
|
+
"iopub.status.idle": "2026-02-02T13:00:47.135766Z",
|
|
329
|
+
"shell.execute_reply": "2026-02-02T13:00:47.135154Z"
|
|
330
|
+
},
|
|
331
|
+
"papermill": {
|
|
332
|
+
"duration": 1.068734,
|
|
333
|
+
"end_time": "2026-02-02T13:00:47.136462",
|
|
334
|
+
"exception": false,
|
|
335
|
+
"start_time": "2026-02-02T13:00:46.067728",
|
|
336
|
+
"status": "completed"
|
|
337
|
+
},
|
|
338
|
+
"tags": []
|
|
339
|
+
},
|
|
340
|
+
"outputs": [],
|
|
341
|
+
"source": [
|
|
342
|
+
"# Create the time series profiler and run analysis\n",
|
|
343
|
+
"profiler = TimeSeriesProfiler(entity_column=ENTITY_COLUMN, time_column=TIME_COLUMN)\n",
|
|
344
|
+
"ts_profile = profiler.profile(df)\n",
|
|
345
|
+
"\n",
|
|
346
|
+
"print(\"=\"*70)\n",
|
|
347
|
+
"print(\"TIME SERIES PROFILE SUMMARY\")\n",
|
|
348
|
+
"print(\"=\"*70)\n",
|
|
349
|
+
"print(f\"\\n\\U0001f4ca Dataset Overview:\")\n",
|
|
350
|
+
"print(f\" Total Events: {ts_profile.total_events:,}\")\n",
|
|
351
|
+
"print(f\" Unique Entities: {ts_profile.unique_entities:,}\")\n",
|
|
352
|
+
"print(f\" Avg Events/Entity: {ts_profile.events_per_entity.mean:.1f}\")\n",
|
|
353
|
+
"print(f\" Time Span: {ts_profile.time_span_days:,} days ({ts_profile.time_span_days/365:.1f} years)\")\n",
|
|
354
|
+
"\n",
|
|
355
|
+
"print(f\"\\n\\U0001f4c5 Date Range:\")\n",
|
|
356
|
+
"print(f\" First Event: {ts_profile.first_event_date}\")\n",
|
|
357
|
+
"print(f\" Last Event: {ts_profile.last_event_date}\")\n",
|
|
358
|
+
"\n",
|
|
359
|
+
"print(f\"\\n\\u23f1\\ufe0f Inter-Event Timing:\")\n",
|
|
360
|
+
"if ts_profile.avg_inter_event_days is not None:\n",
|
|
361
|
+
" print(f\" Avg Days Between Events: {ts_profile.avg_inter_event_days:.1f}\")\n",
|
|
362
|
+
"else:\n",
|
|
363
|
+
" print(\" Not enough data to compute inter-event timing\")"
|
|
364
|
+
]
|
|
365
|
+
},
|
|
366
|
+
{
|
|
367
|
+
"cell_type": "markdown",
|
|
368
|
+
"id": "ae15eee5",
|
|
369
|
+
"metadata": {
|
|
370
|
+
"papermill": {
|
|
371
|
+
"duration": 0.002416,
|
|
372
|
+
"end_time": "2026-02-02T13:00:47.141577",
|
|
373
|
+
"exception": false,
|
|
374
|
+
"start_time": "2026-02-02T13:00:47.139161",
|
|
375
|
+
"status": "completed"
|
|
376
|
+
},
|
|
377
|
+
"tags": []
|
|
378
|
+
},
|
|
379
|
+
"source": [
|
|
380
|
+
"## 1a.4 Events per Entity Distribution\n",
|
|
381
|
+
"\n",
|
|
382
|
+
"**Goal:** Understand how event volume varies across entities to guide feature engineering and identify modeling challenges.\n",
|
|
383
|
+
"\n",
|
|
384
|
+
"| Segment | Definition | Why It Matters for Modeling |\n",
|
|
385
|
+
"|---------|------------|---------------------------|\n",
|
|
386
|
+
"| **One-time** | Exactly 1 event | No temporal features possible; cold-start problem |\n",
|
|
387
|
+
"| **Low Activity** | Below Q25 | Sparse features, many zeros; log-transform counts |\n",
|
|
388
|
+
"| **Medium Activity** | Q25 to Q75 | Core population; standard aggregation windows work |\n",
|
|
389
|
+
"| **High Activity** | Above Q75 | Rich features; watch for training set dominance |"
|
|
390
|
+
]
|
|
391
|
+
},
|
|
392
|
+
{
|
|
393
|
+
"cell_type": "code",
|
|
394
|
+
"execution_count": null,
|
|
395
|
+
"id": "80e24681",
|
|
396
|
+
"metadata": {
|
|
397
|
+
"execution": {
|
|
398
|
+
"iopub.execute_input": "2026-02-02T13:00:47.147535Z",
|
|
399
|
+
"iopub.status.busy": "2026-02-02T13:00:47.147412Z",
|
|
400
|
+
"iopub.status.idle": "2026-02-02T13:00:47.180702Z",
|
|
401
|
+
"shell.execute_reply": "2026-02-02T13:00:47.180158Z"
|
|
402
|
+
},
|
|
403
|
+
"papermill": {
|
|
404
|
+
"duration": 0.037113,
|
|
405
|
+
"end_time": "2026-02-02T13:00:47.181315",
|
|
406
|
+
"exception": false,
|
|
407
|
+
"start_time": "2026-02-02T13:00:47.144202",
|
|
408
|
+
"status": "completed"
|
|
409
|
+
},
|
|
410
|
+
"tags": []
|
|
411
|
+
},
|
|
412
|
+
"outputs": [],
|
|
413
|
+
"source": [
|
|
414
|
+
"from customer_retention.stages.profiling import classify_activity_segments\n",
|
|
415
|
+
"\n",
|
|
416
|
+
"segment_result = classify_activity_segments(ts_profile.entity_lifecycles)\n",
|
|
417
|
+
"\n",
|
|
418
|
+
"segment_order = [\"One-time\", \"Low Activity\", \"Medium Activity\", \"High Activity\"]\n",
|
|
419
|
+
"segment_colors = {\n",
|
|
420
|
+
" \"One-time\": \"#d62728\", \"Low Activity\": \"#ff7f0e\",\n",
|
|
421
|
+
" \"Medium Activity\": \"#2ca02c\", \"High Activity\": \"#1f77b4\",\n",
|
|
422
|
+
"}\n",
|
|
423
|
+
"\n",
|
|
424
|
+
"event_counts = segment_result.lifecycles[\"event_count\"]\n",
|
|
425
|
+
"x_max = event_counts.quantile(0.99)\n",
|
|
426
|
+
"bins = np.linspace(0, x_max, 31)\n",
|
|
427
|
+
"bin_centers = (bins[:-1] + bins[1:]) / 2\n",
|
|
428
|
+
"\n",
|
|
429
|
+
"lc = segment_result.lifecycles\n",
|
|
430
|
+
"bin_indices = np.digitize(lc[\"event_count\"], bins) - 1\n",
|
|
431
|
+
"bin_indices = bin_indices.clip(0, len(bin_centers) - 1)\n",
|
|
432
|
+
"lc_binned = lc.assign(_bin=bin_indices)\n",
|
|
433
|
+
"\n",
|
|
434
|
+
"fig = go.Figure()\n",
|
|
435
|
+
"for seg in segment_order:\n",
|
|
436
|
+
" subset = lc_binned[lc_binned[\"activity_segment\"] == seg]\n",
|
|
437
|
+
" if subset.empty:\n",
|
|
438
|
+
" continue\n",
|
|
439
|
+
" counts_per_bin = subset.groupby(\"_bin\").size().reindex(range(len(bin_centers)), fill_value=0)\n",
|
|
440
|
+
" fig.add_trace(go.Bar(\n",
|
|
441
|
+
" x=bin_centers, y=counts_per_bin.values, name=seg,\n",
|
|
442
|
+
" marker_color=segment_colors[seg], opacity=0.85,\n",
|
|
443
|
+
" ))\n",
|
|
444
|
+
"\n",
|
|
445
|
+
"fig.add_vline(\n",
|
|
446
|
+
" x=event_counts.median(), line_dash=\"solid\", line_color=\"gray\",\n",
|
|
447
|
+
" annotation_text=f\"Median: {event_counts.median():.0f}\",\n",
|
|
448
|
+
" annotation_position=\"top left\",\n",
|
|
449
|
+
")\n",
|
|
450
|
+
"\n",
|
|
451
|
+
"use_log_y = event_counts.value_counts().max() > event_counts.value_counts().median() * 50\n",
|
|
452
|
+
"\n",
|
|
453
|
+
"log_note = (\"<br><sub>Log Y-axis: bar heights compress large differences — \"\n",
|
|
454
|
+
" \"see table below for actual segment shares</sub>\" if use_log_y else \"\")\n",
|
|
455
|
+
"\n",
|
|
456
|
+
"fig.update_layout(\n",
|
|
457
|
+
" barmode=\"stack\", template=\"plotly_white\", height=420,\n",
|
|
458
|
+
" title=\"Events per Entity by Activity Segment\" + log_note,\n",
|
|
459
|
+
" xaxis_title=\"Number of Events\",\n",
|
|
460
|
+
" yaxis_title=\"Entities\",\n",
|
|
461
|
+
" yaxis_type=\"log\" if use_log_y else \"linear\",\n",
|
|
462
|
+
" legend=dict(orientation=\"h\", yanchor=\"top\", y=-0.15, xanchor=\"center\", x=0.5),\n",
|
|
463
|
+
" margin=dict(b=70),\n",
|
|
464
|
+
")\n",
|
|
465
|
+
"display_figure(fig)"
|
|
466
|
+
]
|
|
467
|
+
},
|
|
468
|
+
{
|
|
469
|
+
"cell_type": "code",
|
|
470
|
+
"execution_count": null,
|
|
471
|
+
"id": "cb2c1433",
|
|
472
|
+
"metadata": {
|
|
473
|
+
"execution": {
|
|
474
|
+
"iopub.execute_input": "2026-02-02T13:00:47.191417Z",
|
|
475
|
+
"iopub.status.busy": "2026-02-02T13:00:47.191286Z",
|
|
476
|
+
"iopub.status.idle": "2026-02-02T13:00:47.196159Z",
|
|
477
|
+
"shell.execute_reply": "2026-02-02T13:00:47.195708Z"
|
|
478
|
+
},
|
|
479
|
+
"papermill": {
|
|
480
|
+
"duration": 0.011037,
|
|
481
|
+
"end_time": "2026-02-02T13:00:47.196670",
|
|
482
|
+
"exception": false,
|
|
483
|
+
"start_time": "2026-02-02T13:00:47.185633",
|
|
484
|
+
"status": "completed"
|
|
485
|
+
},
|
|
486
|
+
"tags": []
|
|
487
|
+
},
|
|
488
|
+
"outputs": [],
|
|
489
|
+
"source": [
|
|
490
|
+
"print(f\"Segment thresholds: Q25 = {segment_result.q25_threshold:.0f} events, \"\n",
|
|
491
|
+
" f\"Q75 = {segment_result.q75_threshold:.0f} events\\n\")\n",
|
|
492
|
+
"display_table(segment_result.recommendations)"
|
|
493
|
+
]
|
|
494
|
+
},
|
|
495
|
+
{
|
|
496
|
+
"cell_type": "markdown",
|
|
497
|
+
"id": "9edcbf88",
|
|
498
|
+
"metadata": {
|
|
499
|
+
"papermill": {
|
|
500
|
+
"duration": 0.004159,
|
|
501
|
+
"end_time": "2026-02-02T13:00:47.206083",
|
|
502
|
+
"exception": false,
|
|
503
|
+
"start_time": "2026-02-02T13:00:47.201924",
|
|
504
|
+
"status": "completed"
|
|
505
|
+
},
|
|
506
|
+
"tags": []
|
|
507
|
+
},
|
|
508
|
+
"source": [
|
|
509
|
+
"## 1a.5 Entity Lifecycle Analysis\n",
|
|
510
|
+
"\n",
|
|
511
|
+
"**Goal:** Classify entities by their engagement pattern to inform feature engineering and modeling strategy.\n",
|
|
512
|
+
"\n",
|
|
513
|
+
"We combine two dimensions — **tenure** (days from first to last event) and **intensity** (events per day of tenure) — to identify four lifecycle quadrants:\n",
|
|
514
|
+
"\n",
|
|
515
|
+
"| Quadrant | Tenure | Intensity | Meaning | Feature Implication |\n",
|
|
516
|
+
"|----------|--------|-----------|---------|---------------------|\n",
|
|
517
|
+
"| **Intense & Brief** | Short | High | Burst engagement, then gone | Recency features critical |\n",
|
|
518
|
+
"| **Steady & Loyal** | Long | High | Consistent power users | Trend/seasonality features valuable |\n",
|
|
519
|
+
"| **Occasional & Loyal** | Long | Low | Infrequent but persistent | Wider time windows needed |\n",
|
|
520
|
+
"| **One-shot** | Short | Low | Single/few interactions | May lack enough history for features |"
|
|
521
|
+
]
|
|
522
|
+
},
|
|
523
|
+
{
|
|
524
|
+
"cell_type": "code",
|
|
525
|
+
"execution_count": null,
|
|
526
|
+
"id": "b6c52e77",
|
|
527
|
+
"metadata": {
|
|
528
|
+
"execution": {
|
|
529
|
+
"iopub.execute_input": "2026-02-02T13:00:47.215136Z",
|
|
530
|
+
"iopub.status.busy": "2026-02-02T13:00:47.215018Z",
|
|
531
|
+
"iopub.status.idle": "2026-02-02T13:00:47.221586Z",
|
|
532
|
+
"shell.execute_reply": "2026-02-02T13:00:47.221133Z"
|
|
533
|
+
},
|
|
534
|
+
"papermill": {
|
|
535
|
+
"duration": 0.011923,
|
|
536
|
+
"end_time": "2026-02-02T13:00:47.222065",
|
|
537
|
+
"exception": false,
|
|
538
|
+
"start_time": "2026-02-02T13:00:47.210142",
|
|
539
|
+
"status": "completed"
|
|
540
|
+
},
|
|
541
|
+
"tags": []
|
|
542
|
+
},
|
|
543
|
+
"outputs": [],
|
|
544
|
+
"source": [
|
|
545
|
+
"from customer_retention.stages.profiling import classify_lifecycle_quadrants\n",
|
|
546
|
+
"\n",
|
|
547
|
+
"quadrant_result = classify_lifecycle_quadrants(ts_profile.entity_lifecycles)\n",
|
|
548
|
+
"lifecycles = quadrant_result.lifecycles\n",
|
|
549
|
+
"\n",
|
|
550
|
+
"quadrant_order = [\"Steady & Loyal\", \"Occasional & Loyal\", \"Intense & Brief\", \"One-shot\"]\n",
|
|
551
|
+
"quadrant_colors = {\n",
|
|
552
|
+
" \"Steady & Loyal\": \"#2ca02c\", \"Occasional & Loyal\": \"#1f77b4\",\n",
|
|
553
|
+
" \"Intense & Brief\": \"#ff7f0e\", \"One-shot\": \"#d62728\",\n",
|
|
554
|
+
"}\n",
|
|
555
|
+
"tenure_median = quadrant_result.tenure_threshold\n",
|
|
556
|
+
"\n",
|
|
557
|
+
"print(f\"Split thresholds: Tenure median = {quadrant_result.tenure_threshold:.0f} days, \"\n",
|
|
558
|
+
" f\"Intensity median = {quadrant_result.intensity_threshold:.4f} events/day\\n\")\n",
|
|
559
|
+
"display_table(quadrant_result.recommendations)"
|
|
560
|
+
]
|
|
561
|
+
},
|
|
562
|
+
{
|
|
563
|
+
"cell_type": "code",
|
|
564
|
+
"execution_count": null,
|
|
565
|
+
"id": "a3ba015d",
|
|
566
|
+
"metadata": {
|
|
567
|
+
"execution": {
|
|
568
|
+
"iopub.execute_input": "2026-02-02T13:00:47.232407Z",
|
|
569
|
+
"iopub.status.busy": "2026-02-02T13:00:47.232297Z",
|
|
570
|
+
"iopub.status.idle": "2026-02-02T13:00:47.287294Z",
|
|
571
|
+
"shell.execute_reply": "2026-02-02T13:00:47.286736Z"
|
|
572
|
+
},
|
|
573
|
+
"papermill": {
|
|
574
|
+
"duration": 0.061037,
|
|
575
|
+
"end_time": "2026-02-02T13:00:47.288051",
|
|
576
|
+
"exception": false,
|
|
577
|
+
"start_time": "2026-02-02T13:00:47.227014",
|
|
578
|
+
"status": "completed"
|
|
579
|
+
},
|
|
580
|
+
"tags": []
|
|
581
|
+
},
|
|
582
|
+
"outputs": [],
|
|
583
|
+
"source": [
|
|
584
|
+
"# Combined panel: small multiples (top 2x2) + tenure histogram (bottom)\n",
|
|
585
|
+
"fig = make_subplots(\n",
|
|
586
|
+
" rows=3, cols=2,\n",
|
|
587
|
+
" subplot_titles=[*quadrant_order, \"Tenure Distribution by Quadrant\", \"\"],\n",
|
|
588
|
+
" specs=[[{}, {}], [{}, {}], [{\"colspan\": 2}, None]],\n",
|
|
589
|
+
" vertical_spacing=0.08, horizontal_spacing=0.10,\n",
|
|
590
|
+
" row_heights=[0.28, 0.28, 0.44],\n",
|
|
591
|
+
")\n",
|
|
592
|
+
"\n",
|
|
593
|
+
"# Top 2x2: scatter per quadrant\n",
|
|
594
|
+
"positions = [(1, 1), (1, 2), (2, 1), (2, 2)]\n",
|
|
595
|
+
"for (row, col), q in zip(positions, quadrant_order):\n",
|
|
596
|
+
" subset = lifecycles[lifecycles[\"lifecycle_quadrant\"] == q]\n",
|
|
597
|
+
" fig.add_trace(go.Scatter(\n",
|
|
598
|
+
" x=subset[\"duration_days\"], y=subset[\"intensity\"],\n",
|
|
599
|
+
" mode=\"markers\", marker=dict(color=quadrant_colors[q], opacity=0.4, size=3),\n",
|
|
600
|
+
" showlegend=False,\n",
|
|
601
|
+
" ), row=row, col=col)\n",
|
|
602
|
+
" fig.update_xaxes(title_text=\"Tenure (d)\", title_font_size=10, row=row, col=col)\n",
|
|
603
|
+
" fig.update_yaxes(title_text=\"Ev/day\", title_font_size=10, row=row, col=col)\n",
|
|
604
|
+
"\n",
|
|
605
|
+
"# Bottom: overlaid tenure histograms\n",
|
|
606
|
+
"for q in quadrant_order:\n",
|
|
607
|
+
" subset = lifecycles[lifecycles[\"lifecycle_quadrant\"] == q]\n",
|
|
608
|
+
" fig.add_trace(go.Histogram(\n",
|
|
609
|
+
" x=subset[\"duration_days\"], nbinsx=40, name=q,\n",
|
|
610
|
+
" marker_color=quadrant_colors[q], opacity=0.6,\n",
|
|
611
|
+
" ), row=3, col=1)\n",
|
|
612
|
+
"\n",
|
|
613
|
+
"fig.add_vline(x=tenure_median, line_dash=\"dot\", line_color=\"gray\", opacity=0.5,\n",
|
|
614
|
+
" row=3, col=1, annotation_text=f\"Median: {tenure_median:.0f}d\",\n",
|
|
615
|
+
" annotation_position=\"top left\")\n",
|
|
616
|
+
"\n",
|
|
617
|
+
"fig.update_layout(\n",
|
|
618
|
+
" barmode=\"overlay\", template=\"plotly_white\", height=900,\n",
|
|
619
|
+
" title=\"Entity Lifecycle Quadrants\",\n",
|
|
620
|
+
" legend=dict(orientation=\"h\", yanchor=\"top\", y=-0.05, xanchor=\"center\", x=0.5),\n",
|
|
621
|
+
" margin=dict(b=80),\n",
|
|
622
|
+
")\n",
|
|
623
|
+
"fig.update_xaxes(title_text=\"Tenure (days)\", row=3, col=1)\n",
|
|
624
|
+
"fig.update_yaxes(title_text=\"Entities\", row=3, col=1)\n",
|
|
625
|
+
"display_figure(fig)"
|
|
626
|
+
]
|
|
627
|
+
},
|
|
628
|
+
{
|
|
629
|
+
"cell_type": "markdown",
|
|
630
|
+
"id": "3d2820d4",
|
|
631
|
+
"metadata": {
|
|
632
|
+
"papermill": {
|
|
633
|
+
"duration": 0.007238,
|
|
634
|
+
"end_time": "2026-02-02T13:00:47.302328",
|
|
635
|
+
"exception": false,
|
|
636
|
+
"start_time": "2026-02-02T13:00:47.295090",
|
|
637
|
+
"status": "completed"
|
|
638
|
+
},
|
|
639
|
+
"tags": []
|
|
640
|
+
},
|
|
641
|
+
"source": [
|
|
642
|
+
"## 1a.6 Temporal Coverage Analysis\n",
|
|
643
|
+
"\n",
|
|
644
|
+
"**Why this matters for modeling:**\n",
|
|
645
|
+
"\n",
|
|
646
|
+
"| Question | Impact |\n",
|
|
647
|
+
"|----------|--------|\n",
|
|
648
|
+
"| **Data gaps?** | Gaps produce misleading aggregation features — zeros that mean \"no data\" not \"no activity\" |\n",
|
|
649
|
+
"| **Volume trend?** | Growing volume means older entities have sparser history; declining means recent windows are underpopulated |\n",
|
|
650
|
+
"| **Entity coverage by window?** | Shows which aggregation windows will produce meaningful features vs. mostly zeros |\n",
|
|
651
|
+
"| **Entity arrival pattern?** | Concentrated arrivals = cohort effects; steady arrivals = stable population |"
|
|
652
|
+
]
|
|
653
|
+
},
|
|
654
|
+
{
|
|
655
|
+
"cell_type": "code",
|
|
656
|
+
"execution_count": null,
|
|
657
|
+
"id": "8a57d827",
|
|
658
|
+
"metadata": {
|
|
659
|
+
"execution": {
|
|
660
|
+
"iopub.execute_input": "2026-02-02T13:00:47.318548Z",
|
|
661
|
+
"iopub.status.busy": "2026-02-02T13:00:47.318439Z",
|
|
662
|
+
"iopub.status.idle": "2026-02-02T13:00:47.359092Z",
|
|
663
|
+
"shell.execute_reply": "2026-02-02T13:00:47.358688Z"
|
|
664
|
+
},
|
|
665
|
+
"papermill": {
|
|
666
|
+
"duration": 0.049921,
|
|
667
|
+
"end_time": "2026-02-02T13:00:47.359838",
|
|
668
|
+
"exception": false,
|
|
669
|
+
"start_time": "2026-02-02T13:00:47.309917",
|
|
670
|
+
"status": "completed"
|
|
671
|
+
},
|
|
672
|
+
"tags": []
|
|
673
|
+
},
|
|
674
|
+
"outputs": [],
|
|
675
|
+
"source": [
|
|
676
|
+
"from customer_retention.stages.profiling import analyze_temporal_coverage\n",
|
|
677
|
+
"\n",
|
|
678
|
+
"df_temp = df.copy()\n",
|
|
679
|
+
"df_temp[TIME_COLUMN] = pd.to_datetime(df_temp[TIME_COLUMN])\n",
|
|
680
|
+
"\n",
|
|
681
|
+
"coverage_result = analyze_temporal_coverage(df_temp, ENTITY_COLUMN, TIME_COLUMN)\n",
|
|
682
|
+
"\n",
|
|
683
|
+
"# Events over time with gap highlighting\n",
|
|
684
|
+
"fig = go.Figure()\n",
|
|
685
|
+
"fig.add_trace(go.Scatter(\n",
|
|
686
|
+
" x=coverage_result.events_over_time.index,\n",
|
|
687
|
+
" y=coverage_result.events_over_time.values,\n",
|
|
688
|
+
" mode=\"lines\", fill=\"tozeroy\", name=\"Events\", line_color=\"steelblue\",\n",
|
|
689
|
+
"))\n",
|
|
690
|
+
"\n",
|
|
691
|
+
"for gap in coverage_result.gaps:\n",
|
|
692
|
+
" color = {\"minor\": \"rgba(255,165,0,0.15)\", \"moderate\": \"rgba(255,100,0,0.25)\",\n",
|
|
693
|
+
" \"major\": \"rgba(255,0,0,0.25)\"}[gap.severity]\n",
|
|
694
|
+
" fig.add_vrect(\n",
|
|
695
|
+
" x0=gap.start, x1=gap.end, fillcolor=color, line_width=0,\n",
|
|
696
|
+
" annotation_text=f\"{gap.duration_days:.0f}d gap\",\n",
|
|
697
|
+
" annotation_position=\"top left\", annotation_font_size=10,\n",
|
|
698
|
+
" )\n",
|
|
699
|
+
"\n",
|
|
700
|
+
"trend_label = f\"{coverage_result.volume_trend} ({coverage_result.volume_change_pct:+.0%})\"\n",
|
|
701
|
+
"fig.update_layout(\n",
|
|
702
|
+
" title=f\"Event Volume Over Time<br><sub>Trend: {trend_label}\"\n",
|
|
703
|
+
" + (f\" | {len(coverage_result.gaps)} gap(s) highlighted\" if coverage_result.gaps else \"\")\n",
|
|
704
|
+
" + \"</sub>\",\n",
|
|
705
|
+
" xaxis_title=\"Date\", yaxis_title=\"Events per Period\",\n",
|
|
706
|
+
" template=\"plotly_white\", height=380,\n",
|
|
707
|
+
")\n",
|
|
708
|
+
"display_figure(fig)"
|
|
709
|
+
]
|
|
710
|
+
},
|
|
711
|
+
{
|
|
712
|
+
"cell_type": "code",
|
|
713
|
+
"execution_count": null,
|
|
714
|
+
"id": "0434f9b0",
|
|
715
|
+
"metadata": {
|
|
716
|
+
"execution": {
|
|
717
|
+
"iopub.execute_input": "2026-02-02T13:00:47.378950Z",
|
|
718
|
+
"iopub.status.busy": "2026-02-02T13:00:47.378814Z",
|
|
719
|
+
"iopub.status.idle": "2026-02-02T13:00:47.398377Z",
|
|
720
|
+
"shell.execute_reply": "2026-02-02T13:00:47.397715Z"
|
|
721
|
+
},
|
|
722
|
+
"papermill": {
|
|
723
|
+
"duration": 0.029693,
|
|
724
|
+
"end_time": "2026-02-02T13:00:47.399039",
|
|
725
|
+
"exception": false,
|
|
726
|
+
"start_time": "2026-02-02T13:00:47.369346",
|
|
727
|
+
"status": "completed"
|
|
728
|
+
},
|
|
729
|
+
"tags": []
|
|
730
|
+
},
|
|
731
|
+
"outputs": [],
|
|
732
|
+
"source": [
|
|
733
|
+
"fig = make_subplots(\n",
|
|
734
|
+
" rows=1, cols=2, subplot_titles=[\"New Entities Over Time\", \"Entity Coverage by Window\"],\n",
|
|
735
|
+
" column_widths=[0.6, 0.4], horizontal_spacing=0.12,\n",
|
|
736
|
+
")\n",
|
|
737
|
+
"\n",
|
|
738
|
+
"# Left: new entities over time\n",
|
|
739
|
+
"fig.add_trace(go.Bar(\n",
|
|
740
|
+
" x=coverage_result.new_entities_over_time.index,\n",
|
|
741
|
+
" y=coverage_result.new_entities_over_time.values,\n",
|
|
742
|
+
" marker_color=\"mediumseagreen\", opacity=0.8, showlegend=False,\n",
|
|
743
|
+
"), row=1, col=1)\n",
|
|
744
|
+
"fig.update_xaxes(title_text=\"First Event Date\", row=1, col=1)\n",
|
|
745
|
+
"fig.update_yaxes(title_text=\"New Entities\", row=1, col=1)\n",
|
|
746
|
+
"\n",
|
|
747
|
+
"# Right: entity window coverage bar chart\n",
|
|
748
|
+
"cov_data = [(c.window, c.coverage_pct, c.active_entities) for c in coverage_result.entity_window_coverage]\n",
|
|
749
|
+
"windows_labels = [c[0] for c in cov_data]\n",
|
|
750
|
+
"coverage_pcts = [c[1] * 100 for c in cov_data]\n",
|
|
751
|
+
"active_counts = [c[2] for c in cov_data]\n",
|
|
752
|
+
"\n",
|
|
753
|
+
"bar_colors = [\"#2ca02c\" if p >= 50 else \"#ff7f0e\" if p >= 10 else \"#d62728\" for p in coverage_pcts]\n",
|
|
754
|
+
"fig.add_trace(go.Bar(\n",
|
|
755
|
+
" x=windows_labels, y=coverage_pcts, showlegend=False,\n",
|
|
756
|
+
" marker_color=bar_colors, opacity=0.85,\n",
|
|
757
|
+
" text=[f\"{p:.0f}%<br>({n:,})\" for p, n in zip(coverage_pcts, active_counts)],\n",
|
|
758
|
+
" textposition=\"outside\", textfont_size=9,\n",
|
|
759
|
+
"), row=1, col=2)\n",
|
|
760
|
+
"fig.update_xaxes(title_text=\"Window\", row=1, col=2)\n",
|
|
761
|
+
"fig.update_yaxes(title_text=\"% Entities Active\", range=[0, 115], row=1, col=2)\n",
|
|
762
|
+
"\n",
|
|
763
|
+
"fig.update_layout(\n",
|
|
764
|
+
" template=\"plotly_white\", height=380,\n",
|
|
765
|
+
" title=\"Entity Arrival & Window Coverage\"\n",
|
|
766
|
+
" + f\"<br><sub>Reference date: {coverage_result.last_event.strftime('%Y-%m-%d')}</sub>\",\n",
|
|
767
|
+
" margin=dict(b=50),\n",
|
|
768
|
+
")\n",
|
|
769
|
+
"display_figure(fig)"
|
|
770
|
+
]
|
|
771
|
+
},
|
|
772
|
+
{
|
|
773
|
+
"cell_type": "code",
|
|
774
|
+
"execution_count": null,
|
|
775
|
+
"id": "236521ff",
|
|
776
|
+
"metadata": {
|
|
777
|
+
"execution": {
|
|
778
|
+
"iopub.execute_input": "2026-02-02T13:00:47.420663Z",
|
|
779
|
+
"iopub.status.busy": "2026-02-02T13:00:47.420508Z",
|
|
780
|
+
"iopub.status.idle": "2026-02-02T13:00:47.423477Z",
|
|
781
|
+
"shell.execute_reply": "2026-02-02T13:00:47.422897Z"
|
|
782
|
+
},
|
|
783
|
+
"papermill": {
|
|
784
|
+
"duration": 0.01449,
|
|
785
|
+
"end_time": "2026-02-02T13:00:47.424096",
|
|
786
|
+
"exception": false,
|
|
787
|
+
"start_time": "2026-02-02T13:00:47.409606",
|
|
788
|
+
"status": "completed"
|
|
789
|
+
},
|
|
790
|
+
"tags": []
|
|
791
|
+
},
|
|
792
|
+
"outputs": [],
|
|
793
|
+
"source": [
|
|
794
|
+
"print(f\"Coverage Summary:\")\n",
|
|
795
|
+
"print(f\" Time span: {coverage_result.time_span_days:,} days \"\n",
|
|
796
|
+
" f\"({coverage_result.first_event.strftime('%Y-%m-%d')} to {coverage_result.last_event.strftime('%Y-%m-%d')})\")\n",
|
|
797
|
+
"print(f\" Volume trend: {coverage_result.volume_trend} ({coverage_result.volume_change_pct:+.0%})\")\n",
|
|
798
|
+
"print(f\" Data gaps: {len(coverage_result.gaps)} detected\"\n",
|
|
799
|
+
" + (f\" ({sum(g.duration_days for g in coverage_result.gaps):.0f} total days)\" if coverage_result.gaps else \"\"))\n",
|
|
800
|
+
"\n",
|
|
801
|
+
"if coverage_result.recommendations:\n",
|
|
802
|
+
" print(f\"\\nRecommendations:\")\n",
|
|
803
|
+
" for rec in coverage_result.recommendations:\n",
|
|
804
|
+
" print(f\" -> {rec}\")\n",
|
|
805
|
+
"else:\n",
|
|
806
|
+
" print(f\"\\nNo coverage issues detected — data is suitable for all candidate windows.\")"
|
|
807
|
+
]
|
|
808
|
+
},
|
|
809
|
+
{
|
|
810
|
+
"cell_type": "code",
|
|
811
|
+
"execution_count": null,
|
|
812
|
+
"id": "a66b47f7",
|
|
813
|
+
"metadata": {
|
|
814
|
+
"execution": {
|
|
815
|
+
"iopub.execute_input": "2026-02-02T13:00:47.444695Z",
|
|
816
|
+
"iopub.status.busy": "2026-02-02T13:00:47.444576Z",
|
|
817
|
+
"iopub.status.idle": "2026-02-02T13:00:47.447904Z",
|
|
818
|
+
"shell.execute_reply": "2026-02-02T13:00:47.447448Z"
|
|
819
|
+
},
|
|
820
|
+
"papermill": {
|
|
821
|
+
"duration": 0.014214,
|
|
822
|
+
"end_time": "2026-02-02T13:00:47.448412",
|
|
823
|
+
"exception": false,
|
|
824
|
+
"start_time": "2026-02-02T13:00:47.434198",
|
|
825
|
+
"status": "completed"
|
|
826
|
+
},
|
|
827
|
+
"tags": []
|
|
828
|
+
},
|
|
829
|
+
"outputs": [],
|
|
830
|
+
"source": [
|
|
831
|
+
"from customer_retention.stages.profiling import derive_drift_implications\n",
|
|
832
|
+
"\n",
|
|
833
|
+
"drift = derive_drift_implications(coverage_result)\n",
|
|
834
|
+
"\n",
|
|
835
|
+
"risk_colors = {\"low\": \"\\033[92m\", \"moderate\": \"\\033[93m\", \"high\": \"\\033[91m\"}\n",
|
|
836
|
+
"reset = \"\\033[0m\"\n",
|
|
837
|
+
"color = risk_colors.get(drift.risk_level, \"\")\n",
|
|
838
|
+
"\n",
|
|
839
|
+
"print(f\"Parameter Drift Assessment: {color}{drift.risk_level.upper()}{reset}\")\n",
|
|
840
|
+
"print(f\" Volume drift risk: {drift.volume_drift_risk}\")\n",
|
|
841
|
+
"print(f\" Population stability: {drift.population_stability:.2f}\")\n",
|
|
842
|
+
"print(f\" Data regimes: {drift.regime_count}\")\n",
|
|
843
|
+
"if drift.recommended_training_start:\n",
|
|
844
|
+
" print(f\" Recommended training start: {drift.recommended_training_start.strftime('%Y-%m-%d')}\")\n",
|
|
845
|
+
"\n",
|
|
846
|
+
"print(f\"\\nRationale:\")\n",
|
|
847
|
+
"for r in drift.rationale:\n",
|
|
848
|
+
" print(f\" -> {r}\")"
|
|
849
|
+
]
|
|
850
|
+
},
|
|
851
|
+
{
|
|
852
|
+
"cell_type": "markdown",
|
|
853
|
+
"id": "5fe41a6c",
|
|
854
|
+
"metadata": {
|
|
855
|
+
"papermill": {
|
|
856
|
+
"duration": 0.010352,
|
|
857
|
+
"end_time": "2026-02-02T13:00:47.468456",
|
|
858
|
+
"exception": false,
|
|
859
|
+
"start_time": "2026-02-02T13:00:47.458104",
|
|
860
|
+
"status": "completed"
|
|
861
|
+
},
|
|
862
|
+
"tags": []
|
|
863
|
+
},
|
|
864
|
+
"source": [
|
|
865
|
+
"## 1a.7 Inter-Event Timing Analysis\n",
|
|
866
|
+
"\n",
|
|
867
|
+
"**📖 Understanding Inter-Event Time:**\n",
|
|
868
|
+
"- Time between consecutive events for each entity\n",
|
|
869
|
+
"- Short inter-event time: Frequent engagement\n",
|
|
870
|
+
"- Long inter-event time: Sporadic usage or churn risk"
|
|
871
|
+
]
|
|
872
|
+
},
|
|
873
|
+
{
|
|
874
|
+
"cell_type": "code",
|
|
875
|
+
"execution_count": null,
|
|
876
|
+
"id": "7fd6154f",
|
|
877
|
+
"metadata": {
|
|
878
|
+
"execution": {
|
|
879
|
+
"iopub.execute_input": "2026-02-02T13:00:47.488399Z",
|
|
880
|
+
"iopub.status.busy": "2026-02-02T13:00:47.488285Z",
|
|
881
|
+
"iopub.status.idle": "2026-02-02T13:00:48.452937Z",
|
|
882
|
+
"shell.execute_reply": "2026-02-02T13:00:48.452317Z"
|
|
883
|
+
},
|
|
884
|
+
"papermill": {
|
|
885
|
+
"duration": 0.976266,
|
|
886
|
+
"end_time": "2026-02-02T13:00:48.454313",
|
|
887
|
+
"exception": false,
|
|
888
|
+
"start_time": "2026-02-02T13:00:47.478047",
|
|
889
|
+
"status": "completed"
|
|
890
|
+
},
|
|
891
|
+
"tags": []
|
|
892
|
+
},
|
|
893
|
+
"outputs": [],
|
|
894
|
+
"source": [
|
|
895
|
+
"# Compute inter-event times for all entities with >1 event\n",
|
|
896
|
+
"inter_event_times = []\n",
|
|
897
|
+
"\n",
|
|
898
|
+
"for entity, group in df_temp.groupby(ENTITY_COLUMN):\n",
|
|
899
|
+
" if len(group) < 2:\n",
|
|
900
|
+
" continue\n",
|
|
901
|
+
" sorted_times = group[TIME_COLUMN].sort_values()\n",
|
|
902
|
+
" diffs = sorted_times.diff().dropna()\n",
|
|
903
|
+
" inter_event_times.extend(diffs.dt.total_seconds() / 86400) # Convert to days\n",
|
|
904
|
+
"\n",
|
|
905
|
+
"if inter_event_times:\n",
|
|
906
|
+
" inter_event_series = pd.Series(inter_event_times)\n",
|
|
907
|
+
" \n",
|
|
908
|
+
" print(\"\\u23f1\\ufe0f Inter-Event Time Distribution (days):\")\n",
|
|
909
|
+
" print(f\" Min: {inter_event_series.min():.2f}\")\n",
|
|
910
|
+
" print(f\" 25th percentile: {inter_event_series.quantile(0.25):.2f}\")\n",
|
|
911
|
+
" print(f\" Median: {inter_event_series.median():.2f}\")\n",
|
|
912
|
+
" print(f\" Mean: {inter_event_series.mean():.2f}\")\n",
|
|
913
|
+
" print(f\" 75th percentile: {inter_event_series.quantile(0.75):.2f}\")\n",
|
|
914
|
+
" print(f\" Max: {inter_event_series.max():.2f}\")\n",
|
|
915
|
+
" \n",
|
|
916
|
+
" # Histogram\n",
|
|
917
|
+
" fig = go.Figure()\n",
|
|
918
|
+
" \n",
|
|
919
|
+
" # Cap at 99th percentile for visualization\n",
|
|
920
|
+
" cap = inter_event_series.quantile(0.99)\n",
|
|
921
|
+
" display_data = inter_event_series[inter_event_series <= cap]\n",
|
|
922
|
+
" \n",
|
|
923
|
+
" fig.add_trace(go.Histogram(\n",
|
|
924
|
+
" x=display_data,\n",
|
|
925
|
+
" nbinsx=50,\n",
|
|
926
|
+
" name=\"Inter-Event Time\",\n",
|
|
927
|
+
" marker_color=\"coral\",\n",
|
|
928
|
+
" opacity=0.7\n",
|
|
929
|
+
" ))\n",
|
|
930
|
+
" \n",
|
|
931
|
+
" fig.add_vline(x=inter_event_series.median(), line_dash=\"solid\", line_color=\"green\",\n",
|
|
932
|
+
" annotation_text=f\"Median: {inter_event_series.median():.1f} days\",\n",
|
|
933
|
+
" annotation_position=\"top right\")\n",
|
|
934
|
+
" \n",
|
|
935
|
+
" fig.update_layout(\n",
|
|
936
|
+
" title=f\"Inter-Event Time Distribution (capped at {cap:.0f} days = 99th percentile)\",\n",
|
|
937
|
+
" xaxis_title=\"Days Between Events\",\n",
|
|
938
|
+
" yaxis_title=\"Frequency\",\n",
|
|
939
|
+
" template=\"plotly_white\",\n",
|
|
940
|
+
" height=400\n",
|
|
941
|
+
" )\n",
|
|
942
|
+
" display_figure(fig)\n",
|
|
943
|
+
"else:\n",
|
|
944
|
+
" print(\"Not enough multi-event entities to analyze inter-event timing\")"
|
|
945
|
+
]
|
|
946
|
+
},
|
|
947
|
+
{
|
|
948
|
+
"cell_type": "code",
|
|
949
|
+
"execution_count": null,
|
|
950
|
+
"id": "9ff4ef83",
|
|
951
|
+
"metadata": {
|
|
952
|
+
"execution": {
|
|
953
|
+
"iopub.execute_input": "2026-02-02T13:00:48.495793Z",
|
|
954
|
+
"iopub.status.busy": "2026-02-02T13:00:48.495373Z",
|
|
955
|
+
"iopub.status.idle": "2026-02-02T13:00:48.505291Z",
|
|
956
|
+
"shell.execute_reply": "2026-02-02T13:00:48.504775Z"
|
|
957
|
+
},
|
|
958
|
+
"papermill": {
|
|
959
|
+
"duration": 0.037445,
|
|
960
|
+
"end_time": "2026-02-02T13:00:48.505957",
|
|
961
|
+
"exception": false,
|
|
962
|
+
"start_time": "2026-02-02T13:00:48.468512",
|
|
963
|
+
"status": "completed"
|
|
964
|
+
},
|
|
965
|
+
"tags": []
|
|
966
|
+
},
|
|
967
|
+
"outputs": [],
|
|
968
|
+
"source": [
|
|
969
|
+
"if inter_event_times:\n",
|
|
970
|
+
" median_iet = inter_event_series.median()\n",
|
|
971
|
+
" mean_iet = inter_event_series.mean()\n",
|
|
972
|
+
" q25 = inter_event_series.quantile(0.25)\n",
|
|
973
|
+
" q75 = inter_event_series.quantile(0.75)\n",
|
|
974
|
+
" iqr = q75 - q25\n",
|
|
975
|
+
" skew_ratio = mean_iet / median_iet if median_iet > 0 else 1.0\n",
|
|
976
|
+
"\n",
|
|
977
|
+
" print(\"Interpretation:\")\n",
|
|
978
|
+
" if skew_ratio > 1.5:\n",
|
|
979
|
+
" print(f\" Distribution is heavily right-skewed (mean/median = {skew_ratio:.2f})\")\n",
|
|
980
|
+
" print(f\" -> Most entities engage frequently (median {median_iet:.0f}d between events)\")\n",
|
|
981
|
+
" print(f\" -> A long tail of entities has very infrequent engagement\")\n",
|
|
982
|
+
" elif skew_ratio > 1.2:\n",
|
|
983
|
+
" print(f\" Distribution is moderately right-skewed (mean/median = {skew_ratio:.2f})\")\n",
|
|
984
|
+
" print(f\" -> Typical engagement every {median_iet:.0f} days, with some long gaps\")\n",
|
|
985
|
+
" else:\n",
|
|
986
|
+
" print(f\" Distribution is approximately symmetric (mean/median = {skew_ratio:.2f})\")\n",
|
|
987
|
+
" print(f\" -> Consistent engagement pattern around {median_iet:.0f} days\")\n",
|
|
988
|
+
"\n",
|
|
989
|
+
" print(f\"\\n Spread: IQR = {iqr:.0f} days (Q25={q25:.0f}d to Q75={q75:.0f}d)\")\n",
|
|
990
|
+
" if iqr > median_iet:\n",
|
|
991
|
+
" print(f\" -> High variability (IQR > median) — entities have inconsistent timing\")\n",
|
|
992
|
+
" else:\n",
|
|
993
|
+
" print(f\" -> Moderate variability — most entities follow a similar cadence\")\n",
|
|
994
|
+
"\n",
|
|
995
|
+
" print(f\"\\nRecommendations:\")\n",
|
|
996
|
+
" # Window alignment\n",
|
|
997
|
+
" window_map = [(1, \"24h\"), (7, \"7d\"), (14, \"14d\"), (30, \"30d\"),\n",
|
|
998
|
+
" (90, \"90d\"), (180, \"180d\"), (365, \"365d\")]\n",
|
|
999
|
+
" aligned = [(d, w) for d, w in window_map if 0.5 * median_iet <= d <= 2 * median_iet]\n",
|
|
1000
|
+
" if aligned:\n",
|
|
1001
|
+
" aligned_str = \", \".join(w for _, w in aligned)\n",
|
|
1002
|
+
" print(f\" -> Windows aligned with median inter-event time: {aligned_str}\")\n",
|
|
1003
|
+
" print(f\" These capture ~2 events per entity on average\")\n",
|
|
1004
|
+
" else:\n",
|
|
1005
|
+
" print(f\" -> Median inter-event ({median_iet:.0f}d) does not align with standard windows\")\n",
|
|
1006
|
+
"\n",
|
|
1007
|
+
" events_in_30d = 30.0 / median_iet if median_iet > 0 else 0\n",
|
|
1008
|
+
" events_in_90d = 90.0 / median_iet if median_iet > 0 else 0\n",
|
|
1009
|
+
" if events_in_30d < 2:\n",
|
|
1010
|
+
" print(f\" -> 30d window captures only ~{events_in_30d:.1f} events/entity — \"\n",
|
|
1011
|
+
" f\"consider longer windows (90d+) for meaningful aggregations\")\n",
|
|
1012
|
+
" if median_iet < 7:\n",
|
|
1013
|
+
" print(f\" -> High frequency engagement — 7d and 24h windows will be rich with signal\")\n",
|
|
1014
|
+
"\n",
|
|
1015
|
+
" if skew_ratio > 1.5:\n",
|
|
1016
|
+
" print(f\" -> Consider log-transforming inter-event time as a feature \"\n",
|
|
1017
|
+
" f\"(reduces right-skew impact on models)\")\n"
|
|
1018
|
+
]
|
|
1019
|
+
},
|
|
1020
|
+
{
|
|
1021
|
+
"cell_type": "markdown",
|
|
1022
|
+
"id": "a8736668",
|
|
1023
|
+
"metadata": {
|
|
1024
|
+
"papermill": {
|
|
1025
|
+
"duration": 0.015171,
|
|
1026
|
+
"end_time": "2026-02-02T13:00:48.535261",
|
|
1027
|
+
"exception": false,
|
|
1028
|
+
"start_time": "2026-02-02T13:00:48.520090",
|
|
1029
|
+
"status": "completed"
|
|
1030
|
+
},
|
|
1031
|
+
"tags": []
|
|
1032
|
+
},
|
|
1033
|
+
"source": [
|
|
1034
|
+
"## 1a.8 Column Distributions\n",
|
|
1035
|
+
"\n",
|
|
1036
|
+
"Standard column profiling applied to event-level data: distributions, outliers, and transformation needs."
|
|
1037
|
+
]
|
|
1038
|
+
},
|
|
1039
|
+
{
|
|
1040
|
+
"cell_type": "code",
|
|
1041
|
+
"execution_count": null,
|
|
1042
|
+
"id": "83644d1f",
|
|
1043
|
+
"metadata": {
|
|
1044
|
+
"execution": {
|
|
1045
|
+
"iopub.execute_input": "2026-02-02T13:00:48.564797Z",
|
|
1046
|
+
"iopub.status.busy": "2026-02-02T13:00:48.564671Z",
|
|
1047
|
+
"iopub.status.idle": "2026-02-02T13:00:48.583852Z",
|
|
1048
|
+
"shell.execute_reply": "2026-02-02T13:00:48.583026Z"
|
|
1049
|
+
},
|
|
1050
|
+
"papermill": {
|
|
1051
|
+
"duration": 0.035159,
|
|
1052
|
+
"end_time": "2026-02-02T13:00:48.584649",
|
|
1053
|
+
"exception": false,
|
|
1054
|
+
"start_time": "2026-02-02T13:00:48.549490",
|
|
1055
|
+
"status": "completed"
|
|
1056
|
+
},
|
|
1057
|
+
"tags": []
|
|
1058
|
+
},
|
|
1059
|
+
"outputs": [],
|
|
1060
|
+
"source": [
|
|
1061
|
+
"# Use framework's DistributionAnalyzer for comprehensive analysis\n",
|
|
1062
|
+
"analyzer = DistributionAnalyzer()\n",
|
|
1063
|
+
"\n",
|
|
1064
|
+
"numeric_cols = [n for n, c in findings.columns.items() \n",
|
|
1065
|
+
" if c.inferred_type.value in ('numeric_continuous', 'numeric_discrete')\n",
|
|
1066
|
+
" and n not in [ENTITY_COLUMN, TIME_COLUMN] and n not in TEMPORAL_METADATA_COLS]\n",
|
|
1067
|
+
"\n",
|
|
1068
|
+
"# Analyze all numeric columns using the framework\n",
|
|
1069
|
+
"analyses = analyzer.analyze_dataframe(df, numeric_cols)\n",
|
|
1070
|
+
"recommendations = {col: analyzer.recommend_transformation(analysis) \n",
|
|
1071
|
+
" for col, analysis in analyses.items()}\n",
|
|
1072
|
+
"\n",
|
|
1073
|
+
"# Human-readable transformation names\n",
|
|
1074
|
+
"TRANSFORM_DISPLAY_NAMES = {\n",
|
|
1075
|
+
" 'none': 'None needed',\n",
|
|
1076
|
+
" 'log': 'Log transform',\n",
|
|
1077
|
+
" 'log1p': 'Log(1+x) transform',\n",
|
|
1078
|
+
" 'sqrt': 'Square root',\n",
|
|
1079
|
+
" 'box_cox': 'Box-Cox power transform',\n",
|
|
1080
|
+
" 'yeo_johnson': 'Yeo-Johnson power transform',\n",
|
|
1081
|
+
" 'quantile': 'Quantile normalization',\n",
|
|
1082
|
+
" 'robust_scale': 'Robust scaling (median/IQR)',\n",
|
|
1083
|
+
" 'standard_scale': 'Standard scaling (z-score)',\n",
|
|
1084
|
+
" 'minmax_scale': 'Min-Max scaling',\n",
|
|
1085
|
+
"}\n",
|
|
1086
|
+
"\n",
|
|
1087
|
+
"print(\"=\"*70)\n",
|
|
1088
|
+
"print(\"NUMERIC COLUMN PROFILES\")\n",
|
|
1089
|
+
"print(\"=\"*70)\n",
|
|
1090
|
+
"\n",
|
|
1091
|
+
"for col_name in numeric_cols:\n",
|
|
1092
|
+
" col_info = findings.columns[col_name]\n",
|
|
1093
|
+
" analysis = analyses.get(col_name)\n",
|
|
1094
|
+
" rec = recommendations.get(col_name)\n",
|
|
1095
|
+
" \n",
|
|
1096
|
+
" print(f\"\\n{'='*70}\")\n",
|
|
1097
|
+
" print(f\"Column: {col_name}\")\n",
|
|
1098
|
+
" print(f\"Type: {col_info.inferred_type.value} (Confidence: {col_info.confidence:.0%})\")\n",
|
|
1099
|
+
" print(f\"-\" * 70)\n",
|
|
1100
|
+
" \n",
|
|
1101
|
+
" if analysis:\n",
|
|
1102
|
+
" print(f\"📊 Distribution Statistics:\")\n",
|
|
1103
|
+
" print(f\" Mean: {analysis.mean:.3f} | Median: {analysis.median:.3f} | Std: {analysis.std:.3f}\")\n",
|
|
1104
|
+
" print(f\" Range: [{analysis.min_value:.3f}, {analysis.max_value:.3f}]\")\n",
|
|
1105
|
+
" print(f\" Percentiles: 1%={analysis.percentiles['p1']:.3f}, 25%={analysis.q1:.3f}, 75%={analysis.q3:.3f}, 99%={analysis.percentiles['p99']:.3f}\")\n",
|
|
1106
|
+
" print(f\"\\n📈 Shape Analysis:\")\n",
|
|
1107
|
+
" skew_label = '(Right-skewed)' if analysis.skewness > 0.5 else '(Left-skewed)' if analysis.skewness < -0.5 else '(Symmetric)'\n",
|
|
1108
|
+
" print(f\" Skewness: {analysis.skewness:.2f} {skew_label}\")\n",
|
|
1109
|
+
" kurt_label = '(Heavy tails/outliers)' if analysis.kurtosis > 3 else '(Light tails)'\n",
|
|
1110
|
+
" print(f\" Kurtosis: {analysis.kurtosis:.2f} {kurt_label}\")\n",
|
|
1111
|
+
" print(f\" Zeros: {analysis.zero_count:,} ({analysis.zero_percentage:.1f}%)\")\n",
|
|
1112
|
+
" print(f\" Outliers (IQR): {analysis.outlier_count_iqr:,} ({analysis.outlier_percentage:.1f}%)\")\n",
|
|
1113
|
+
" \n",
|
|
1114
|
+
" if rec:\n",
|
|
1115
|
+
" transform_display = TRANSFORM_DISPLAY_NAMES.get(rec.recommended_transform.value, rec.recommended_transform.value)\n",
|
|
1116
|
+
" print(f\"\\n🔧 Recommended Transformation: {transform_display}\")\n",
|
|
1117
|
+
" print(f\" Reason: {rec.reason}\")\n",
|
|
1118
|
+
" print(f\" Priority: {rec.priority}\")\n",
|
|
1119
|
+
" if rec.warnings:\n",
|
|
1120
|
+
" for warn in rec.warnings:\n",
|
|
1121
|
+
" print(f\" ⚠️ {warn}\")"
|
|
1122
|
+
]
|
|
1123
|
+
},
|
|
1124
|
+
{
|
|
1125
|
+
"cell_type": "code",
|
|
1126
|
+
"execution_count": null,
|
|
1127
|
+
"id": "394f68c4",
|
|
1128
|
+
"metadata": {
|
|
1129
|
+
"execution": {
|
|
1130
|
+
"iopub.execute_input": "2026-02-02T13:00:48.614739Z",
|
|
1131
|
+
"iopub.status.busy": "2026-02-02T13:00:48.614603Z",
|
|
1132
|
+
"iopub.status.idle": "2026-02-02T13:00:48.651049Z",
|
|
1133
|
+
"shell.execute_reply": "2026-02-02T13:00:48.650385Z"
|
|
1134
|
+
},
|
|
1135
|
+
"papermill": {
|
|
1136
|
+
"duration": 0.052606,
|
|
1137
|
+
"end_time": "2026-02-02T13:00:48.651826",
|
|
1138
|
+
"exception": false,
|
|
1139
|
+
"start_time": "2026-02-02T13:00:48.599220",
|
|
1140
|
+
"status": "completed"
|
|
1141
|
+
},
|
|
1142
|
+
"tags": []
|
|
1143
|
+
},
|
|
1144
|
+
"outputs": [],
|
|
1145
|
+
"source": [
|
|
1146
|
+
"# Per-column distribution visualizations with transformation recommendations\n",
|
|
1147
|
+
"for col_name in numeric_cols:\n",
|
|
1148
|
+
" analysis = analyses.get(col_name)\n",
|
|
1149
|
+
" rec = recommendations.get(col_name)\n",
|
|
1150
|
+
" if not analysis:\n",
|
|
1151
|
+
" continue\n",
|
|
1152
|
+
" \n",
|
|
1153
|
+
" data = df[col_name].dropna()\n",
|
|
1154
|
+
" fig = go.Figure()\n",
|
|
1155
|
+
" \n",
|
|
1156
|
+
" fig.add_trace(go.Histogram(x=data, nbinsx=50, name='Distribution',\n",
|
|
1157
|
+
" marker_color='steelblue', opacity=0.7))\n",
|
|
1158
|
+
" \n",
|
|
1159
|
+
" mean_val = data.mean()\n",
|
|
1160
|
+
" median_val = data.median()\n",
|
|
1161
|
+
" \n",
|
|
1162
|
+
" # Position labels on opposite sides to avoid overlap\n",
|
|
1163
|
+
" mean_position = \"top right\" if mean_val >= median_val else \"top left\"\n",
|
|
1164
|
+
" median_position = \"top left\" if mean_val >= median_val else \"top right\"\n",
|
|
1165
|
+
" \n",
|
|
1166
|
+
" fig.add_vline(\n",
|
|
1167
|
+
" x=mean_val, line_dash=\"dash\", line_color=\"red\",\n",
|
|
1168
|
+
" annotation_text=f\"Mean: {mean_val:.2f}\",\n",
|
|
1169
|
+
" annotation_position=mean_position,\n",
|
|
1170
|
+
" annotation_font_color=\"red\",\n",
|
|
1171
|
+
" annotation_bgcolor=\"rgba(255,255,255,0.8)\"\n",
|
|
1172
|
+
" )\n",
|
|
1173
|
+
" \n",
|
|
1174
|
+
" fig.add_vline(\n",
|
|
1175
|
+
" x=median_val, line_dash=\"solid\", line_color=\"green\",\n",
|
|
1176
|
+
" annotation_text=f\"Median: {median_val:.2f}\",\n",
|
|
1177
|
+
" annotation_position=median_position,\n",
|
|
1178
|
+
" annotation_font_color=\"green\",\n",
|
|
1179
|
+
" annotation_bgcolor=\"rgba(255,255,255,0.8)\"\n",
|
|
1180
|
+
" )\n",
|
|
1181
|
+
" \n",
|
|
1182
|
+
" # Add 99th percentile marker when more than 5% of values are flagged as outliers\n",
|
|
1183
|
+
" if analysis.outlier_percentage > 5:\n",
|
|
1184
|
+
" fig.add_vline(x=analysis.percentiles['p99'], line_dash=\"dot\", line_color=\"orange\",\n",
|
|
1185
|
+
" annotation_text=f\"99th: {analysis.percentiles['p99']:.2f}\",\n",
|
|
1186
|
+
" annotation_position=\"top right\",\n",
|
|
1187
|
+
" annotation_font_color=\"orange\",\n",
|
|
1188
|
+
" annotation_bgcolor=\"rgba(255,255,255,0.8)\")\n",
|
|
1189
|
+
" \n",
|
|
1190
|
+
" transform_key = rec.recommended_transform.value if rec else \"none\"\n",
|
|
1191
|
+
" transform_label = TRANSFORM_DISPLAY_NAMES.get(transform_key, transform_key)\n",
|
|
1192
|
+
" fig.update_layout(\n",
|
|
1193
|
+
" title=f\"Distribution: {col_name}<br><sub>Skew: {analysis.skewness:.2f} | Kurt: {analysis.kurtosis:.2f} | Strategy: {transform_label}</sub>\",\n",
|
|
1194
|
+
" xaxis_title=col_name,\n",
|
|
1195
|
+
" yaxis_title=\"Count\",\n",
|
|
1196
|
+
" template='plotly_white',\n",
|
|
1197
|
+
" height=400\n",
|
|
1198
|
+
" )\n",
|
|
1199
|
+
" display_figure(fig)"
|
|
1200
|
+
]
|
|
1201
|
+
},
|
|
1202
|
+
{
|
|
1203
|
+
"cell_type": "code",
|
|
1204
|
+
"execution_count": null,
|
|
1205
|
+
"id": "717daeed",
|
|
1206
|
+
"metadata": {
|
|
1207
|
+
"execution": {
|
|
1208
|
+
"iopub.execute_input": "2026-02-02T13:00:48.690400Z",
|
|
1209
|
+
"iopub.status.busy": "2026-02-02T13:00:48.690274Z",
|
|
1210
|
+
"iopub.status.idle": "2026-02-02T13:00:48.769460Z",
|
|
1211
|
+
"shell.execute_reply": "2026-02-02T13:00:48.768110Z"
|
|
1212
|
+
},
|
|
1213
|
+
"papermill": {
|
|
1214
|
+
"duration": 0.100477,
|
|
1215
|
+
"end_time": "2026-02-02T13:00:48.770667",
|
|
1216
|
+
"exception": false,
|
|
1217
|
+
"start_time": "2026-02-02T13:00:48.670190",
|
|
1218
|
+
"status": "completed"
|
|
1219
|
+
},
|
|
1220
|
+
"tags": []
|
|
1221
|
+
},
|
|
1222
|
+
"outputs": [],
|
|
1223
|
+
"source": [
|
|
1224
|
+
"print(\"\\n\" + \"=\"*70)\n",
|
|
1225
|
+
"print(\"CATEGORICAL COLUMN PROFILES\")\n",
|
|
1226
|
+
"print(\"=\"*70)\n",
|
|
1227
|
+
"\n",
|
|
1228
|
+
"categorical_cols = [n for n, c in findings.columns.items()\n",
|
|
1229
|
+
" if c.inferred_type.value in ('categorical_nominal', 'categorical_ordinal', 'binary', 'categorical_cyclical')\n",
|
|
1230
|
+
" and c.inferred_type != ColumnType.TEXT # TEXT columns processed separately in 01a_a\n",
|
|
1231
|
+
" and n not in [ENTITY_COLUMN, TIME_COLUMN] and n not in TEMPORAL_METADATA_COLS]\n",
|
|
1232
|
+
"\n",
|
|
1233
|
+
"for col_name in categorical_cols:\n",
|
|
1234
|
+
" col_info = findings.columns[col_name]\n",
|
|
1235
|
+
" cardinality = col_info.universal_metrics.get('distinct_count', df[col_name].nunique())\n",
|
|
1236
|
+
" \n",
|
|
1237
|
+
" print(f\"\\n{'='*50}\")\n",
|
|
1238
|
+
" print(f\"Column: {col_name}\")\n",
|
|
1239
|
+
" print(f\"Type: {col_info.inferred_type.value} (Confidence: {col_info.confidence:.0%})\")\n",
|
|
1240
|
+
" print(f\"Distinct Values: {cardinality}\")\n",
|
|
1241
|
+
" \n",
|
|
1242
|
+
" # Encoding recommendation based on type and cardinality\n",
|
|
1243
|
+
" if col_info.inferred_type.value == 'categorical_cyclical':\n",
|
|
1244
|
+
" encoding_rec = \"Sin/Cos encoding (cyclical)\"\n",
|
|
1245
|
+
" elif cardinality <= 5:\n",
|
|
1246
|
+
" encoding_rec = \"One-hot encoding (low cardinality)\"\n",
|
|
1247
|
+
" elif cardinality <= 20:\n",
|
|
1248
|
+
" encoding_rec = \"One-hot or Target encoding\"\n",
|
|
1249
|
+
" else:\n",
|
|
1250
|
+
" encoding_rec = \"Target encoding or Frequency encoding (high cardinality)\"\n",
|
|
1251
|
+
" print(f\"Recommended Encoding: {encoding_rec}\")\n",
|
|
1252
|
+
" \n",
|
|
1253
|
+
" # Value counts visualization\n",
|
|
1254
|
+
" value_counts = df[col_name].value_counts().head(10)\n",
|
|
1255
|
+
" fig = charts.bar_chart(value_counts.index.tolist(), value_counts.values.tolist(),\n",
|
|
1256
|
+
" title=f\"Top Categories: {col_name}\")\n",
|
|
1257
|
+
" display_figure(fig)"
|
|
1258
|
+
]
|
|
1259
|
+
},
|
|
1260
|
+
{
|
|
1261
|
+
"cell_type": "code",
|
|
1262
|
+
"execution_count": null,
|
|
1263
|
+
"id": "4b247183",
|
|
1264
|
+
"metadata": {
|
|
1265
|
+
"execution": {
|
|
1266
|
+
"iopub.execute_input": "2026-02-02T13:00:48.840537Z",
|
|
1267
|
+
"iopub.status.busy": "2026-02-02T13:00:48.840361Z",
|
|
1268
|
+
"iopub.status.idle": "2026-02-02T13:00:48.845994Z",
|
|
1269
|
+
"shell.execute_reply": "2026-02-02T13:00:48.844563Z"
|
|
1270
|
+
},
|
|
1271
|
+
"papermill": {
|
|
1272
|
+
"duration": 0.039378,
|
|
1273
|
+
"end_time": "2026-02-02T13:00:48.846750",
|
|
1274
|
+
"exception": false,
|
|
1275
|
+
"start_time": "2026-02-02T13:00:48.807372",
|
|
1276
|
+
"status": "completed"
|
|
1277
|
+
},
|
|
1278
|
+
"tags": []
|
|
1279
|
+
},
|
|
1280
|
+
"outputs": [],
|
|
1281
|
+
"source": [
|
|
1282
|
+
"print(\"\\n\" + \"=\"*70)\n",
|
|
1283
|
+
"print(\"TRANSFORMATION SUMMARY\")\n",
|
|
1284
|
+
"print(\"=\"*70)\n",
|
|
1285
|
+
"\n",
|
|
1286
|
+
"# Human-readable transformation names\n",
|
|
1287
|
+
"TRANSFORM_DISPLAY_NAMES = {\n",
|
|
1288
|
+
" 'none': 'None needed',\n",
|
|
1289
|
+
" 'log': 'Log transform',\n",
|
|
1290
|
+
" 'log1p': 'Log(1+x) transform',\n",
|
|
1291
|
+
" 'sqrt': 'Square root',\n",
|
|
1292
|
+
" 'box_cox': 'Box-Cox power transform',\n",
|
|
1293
|
+
" 'yeo_johnson': 'Yeo-Johnson power transform',\n",
|
|
1294
|
+
" 'quantile': 'Quantile normalization',\n",
|
|
1295
|
+
" 'robust_scale': 'Robust scaling (median/IQR)',\n",
|
|
1296
|
+
" 'standard_scale': 'Standard scaling (z-score)',\n",
|
|
1297
|
+
" 'minmax_scale': 'Min-Max scaling',\n",
|
|
1298
|
+
"}\n",
|
|
1299
|
+
"\n",
|
|
1300
|
+
"transformations = []\n",
|
|
1301
|
+
"for col_name, rec in recommendations.items():\n",
|
|
1302
|
+
" if rec and rec.recommended_transform != TransformationType.NONE:\n",
|
|
1303
|
+
" transform_key = rec.recommended_transform.value\n",
|
|
1304
|
+
" display_name = TRANSFORM_DISPLAY_NAMES.get(transform_key, transform_key)\n",
|
|
1305
|
+
" transformations.append({\n",
|
|
1306
|
+
" 'column': col_name,\n",
|
|
1307
|
+
" 'transform': display_name,\n",
|
|
1308
|
+
" 'reason': rec.reason,\n",
|
|
1309
|
+
" 'priority': rec.priority\n",
|
|
1310
|
+
" })\n",
|
|
1311
|
+
"\n",
|
|
1312
|
+
"if transformations:\n",
|
|
1313
|
+
" print(\"\\nRecommended transformations:\")\n",
|
|
1314
|
+
" # Sort by priority\n",
|
|
1315
|
+
" priority_order = {'high': 0, 'medium': 1, 'low': 2}\n",
|
|
1316
|
+
" transformations.sort(key=lambda x: priority_order.get(x['priority'], 3))\n",
|
|
1317
|
+
" \n",
|
|
1318
|
+
" for t in transformations:\n",
|
|
1319
|
+
" priority_marker = \"🔴\" if t['priority'] == 'high' else \"🟡\" if t['priority'] == 'medium' else \"🟢\"\n",
|
|
1320
|
+
" print(f\"\\n {priority_marker} {t['column']}: {t['transform']}\")\n",
|
|
1321
|
+
" print(f\" Reason: {t['reason']}\")\n",
|
|
1322
|
+
"else:\n",
|
|
1323
|
+
" print(\"\\nNo transformations needed - columns are well-behaved\")"
|
|
1324
|
+
]
|
|
1325
|
+
},
|
|
1326
|
+
{
|
|
1327
|
+
"cell_type": "code",
|
|
1328
|
+
"execution_count": null,
|
|
1329
|
+
"id": "5e4d9111",
|
|
1330
|
+
"metadata": {
|
|
1331
|
+
"execution": {
|
|
1332
|
+
"iopub.execute_input": "2026-02-02T13:00:48.903633Z",
|
|
1333
|
+
"iopub.status.busy": "2026-02-02T13:00:48.903517Z",
|
|
1334
|
+
"iopub.status.idle": "2026-02-02T13:00:48.924603Z",
|
|
1335
|
+
"shell.execute_reply": "2026-02-02T13:00:48.924101Z"
|
|
1336
|
+
},
|
|
1337
|
+
"papermill": {
|
|
1338
|
+
"duration": 0.051964,
|
|
1339
|
+
"end_time": "2026-02-02T13:00:48.925517",
|
|
1340
|
+
"exception": false,
|
|
1341
|
+
"start_time": "2026-02-02T13:00:48.873553",
|
|
1342
|
+
"status": "completed"
|
|
1343
|
+
},
|
|
1344
|
+
"tags": []
|
|
1345
|
+
},
|
|
1346
|
+
"outputs": [],
|
|
1347
|
+
"source": [
|
|
1348
|
+
"# Aggregation perspective: which windows preserve temporal signal per column?\n",
|
|
1349
|
+
"if numeric_cols and inter_event_times:\n",
|
|
1350
|
+
" median_iet = inter_event_series.median()\n",
|
|
1351
|
+
" print(\"=\"*70)\n",
|
|
1352
|
+
" print(\"TEMPORAL AGGREGATION PERSPECTIVE\")\n",
|
|
1353
|
+
" print(\"=\"*70)\n",
|
|
1354
|
+
" print(f\"\\nMedian inter-event time: {median_iet:.0f} days\")\n",
|
|
1355
|
+
" print(f\"Expected events per window (at median cadence):\")\n",
|
|
1356
|
+
" windows_days = [(\"7d\", 7), (\"30d\", 30), (\"90d\", 90), (\"180d\", 180), (\"365d\", 365)]\n",
|
|
1357
|
+
" for label, days in windows_days:\n",
|
|
1358
|
+
" expected = days / median_iet if median_iet > 0 else 0\n",
|
|
1359
|
+
" marker = \"\\u2705\" if expected >= 2 else \"\\u26a0\\ufe0f\" if expected >= 1 else \"\\u274c\"\n",
|
|
1360
|
+
" print(f\" {marker} {label}: ~{expected:.1f} events/entity\")\n",
|
|
1361
|
+
"\n",
|
|
1362
|
+
" # Within-entity vs between-entity coefficient of variation (CV = std / |mean|) per column\n",
|
|
1363
|
+
" print(f\"\\nColumn Temporal Variability (within-entity CV vs between-entity CV):\")\n",
|
|
1364
|
+
" print(f\"{'Column':<25} {'Within-CV':<12} {'Between-CV':<12} {'Ratio':<8} {'Aggregation Guidance'}\")\n",
|
|
1365
|
+
" print(\"-\" * 90)\n",
|
|
1366
|
+
"\n",
|
|
1367
|
+
" for col in numeric_cols:\n",
|
|
1368
|
+
" col_data = df_temp.groupby(ENTITY_COLUMN)[col]\n",
|
|
1369
|
+
" entity_means = col_data.mean()\n",
|
|
1370
|
+
" entity_stds = col_data.std()\n",
|
|
1371
|
+
"\n",
|
|
1372
|
+
" within_cv = (entity_stds / entity_means.abs().clip(lower=1e-10)).median()\n",
|
|
1373
|
+
" between_cv = entity_means.std() / entity_means.abs().mean() if entity_means.abs().mean() > 1e-10 else 0.0\n",
|
|
1374
|
+
"\n",
|
|
1375
|
+
" if between_cv > 0:\n",
|
|
1376
|
+
" ratio = within_cv / between_cv\n",
|
|
1377
|
+
" else:\n",
|
|
1378
|
+
" ratio = float(\"inf\") if within_cv > 0 else 0.0\n",
|
|
1379
|
+
"\n",
|
|
1380
|
+
" if within_cv < 0.3:\n",
|
|
1381
|
+
" guidance = \"Stable per entity -> all_time mean sufficient\"\n",
|
|
1382
|
+
" elif ratio > 1.5:\n",
|
|
1383
|
+
" guidance = \"High temporal dynamics -> shorter windows preserve signal\"\n",
|
|
1384
|
+
" elif ratio > 0.5:\n",
|
|
1385
|
+
" guidance = \"Mixed -> both short and long windows add value\"\n",
|
|
1386
|
+
" else:\n",
|
|
1387
|
+
" guidance = \"Entity-driven -> between-entity differences dominate\"\n",
|
|
1388
|
+
"\n",
|
|
1389
|
+
" within_str = f\"{within_cv:.2f}\" if not np.isinf(within_cv) else \"inf\"\n",
|
|
1390
|
+
" ratio_str = f\"{ratio:.2f}\" if not np.isinf(ratio) else \">10\"\n",
|
|
1391
|
+
" print(f\"{col:<25} {within_str:<12} {between_cv:<12.2f} {ratio_str:<8} {guidance}\")\n",
|
|
1392
|
+
"\n",
|
|
1393
|
+
" print(f\"\\nInterpretation:\")\n",
|
|
1394
|
+
" print(f\" Within-CV: how much each entity\\'s values vary across their events\")\n",
|
|
1395
|
+
" print(f\" Between-CV: how much entity averages differ from each other\")\n",
|
|
1396
|
+
" print(f\" Ratio > 1: temporal variation dominates -> shorter windows capture dynamics\")\n",
|
|
1397
|
+
" print(f\" Ratio < 1: entity identity dominates -> longer windows (or all_time) sufficient\")\n"
|
|
1398
|
+
]
|
|
1399
|
+
},
|
|
1400
|
+
{
|
|
1401
|
+
"cell_type": "markdown",
|
|
1402
|
+
"id": "0b74dd72",
|
|
1403
|
+
"metadata": {
|
|
1404
|
+
"papermill": {
|
|
1405
|
+
"duration": 0.026472,
|
|
1406
|
+
"end_time": "2026-02-02T13:00:49.019319",
|
|
1407
|
+
"exception": false,
|
|
1408
|
+
"start_time": "2026-02-02T13:00:48.992847",
|
|
1409
|
+
"status": "completed"
|
|
1410
|
+
},
|
|
1411
|
+
"tags": []
|
|
1412
|
+
},
|
|
1413
|
+
"source": [
|
|
1414
|
+
"## 1a.9 Update Findings with Time Series Metadata"
|
|
1415
|
+
]
|
|
1416
|
+
},
|
|
1417
|
+
{
|
|
1418
|
+
"cell_type": "code",
|
|
1419
|
+
"execution_count": null,
|
|
1420
|
+
"id": "1b8e715c",
|
|
1421
|
+
"metadata": {
|
|
1422
|
+
"execution": {
|
|
1423
|
+
"iopub.execute_input": "2026-02-02T13:00:49.079024Z",
|
|
1424
|
+
"iopub.status.busy": "2026-02-02T13:00:49.078908Z",
|
|
1425
|
+
"iopub.status.idle": "2026-02-02T13:00:49.097323Z",
|
|
1426
|
+
"shell.execute_reply": "2026-02-02T13:00:49.096468Z"
|
|
1427
|
+
},
|
|
1428
|
+
"papermill": {
|
|
1429
|
+
"duration": 0.05005,
|
|
1430
|
+
"end_time": "2026-02-02T13:00:49.098029",
|
|
1431
|
+
"exception": false,
|
|
1432
|
+
"start_time": "2026-02-02T13:00:49.047979",
|
|
1433
|
+
"status": "completed"
|
|
1434
|
+
},
|
|
1435
|
+
"tags": []
|
|
1436
|
+
},
|
|
1437
|
+
"outputs": [],
|
|
1438
|
+
"source": [
|
|
1439
|
+
"from customer_retention.analysis.auto_explorer.findings import TimeSeriesMetadata\n",
|
|
1440
|
+
"from customer_retention.stages.profiling import WindowRecommendationCollector\n",
|
|
1441
|
+
"\n",
|
|
1442
|
+
"# Build window recommendations from data coverage analysis\n",
|
|
1443
|
+
"window_collector = WindowRecommendationCollector(coverage_threshold=0.10)\n",
|
|
1444
|
+
"window_collector.add_segment_context(segment_result)\n",
|
|
1445
|
+
"window_collector.add_quadrant_context(quadrant_result)\n",
|
|
1446
|
+
"\n",
|
|
1447
|
+
"# Add inter-event timing context if available\n",
|
|
1448
|
+
"if inter_event_times:\n",
|
|
1449
|
+
" window_collector.add_inter_event_context(\n",
|
|
1450
|
+
" median_days=inter_event_series.median(),\n",
|
|
1451
|
+
" mean_days=inter_event_series.mean(),\n",
|
|
1452
|
+
" )\n",
|
|
1453
|
+
"\n",
|
|
1454
|
+
"window_result = window_collector.compute_union(\n",
|
|
1455
|
+
" lifecycles=quadrant_result.lifecycles,\n",
|
|
1456
|
+
" time_span_days=ts_profile.time_span_days,\n",
|
|
1457
|
+
" value_columns=len(numeric_cols),\n",
|
|
1458
|
+
" agg_funcs=4,\n",
|
|
1459
|
+
")\n",
|
|
1460
|
+
"\n",
|
|
1461
|
+
"print(f\"Selected windows: {window_result.windows}\")\n",
|
|
1462
|
+
"print(f\"Total features per entity: ~{window_result.feature_count_estimate}\\n\")\n",
|
|
1463
|
+
"\n",
|
|
1464
|
+
"explanation = window_result.explanation.drop(columns=[\"window_days\"]).copy()\n",
|
|
1465
|
+
"explanation[\"coverage_pct\"] = (explanation[\"coverage_pct\"] * 100).round(1).astype(str) + \"%\"\n",
|
|
1466
|
+
"explanation[\"meaningful_pct\"] = (explanation[\"meaningful_pct\"] * 100).round(1).astype(str) + \"%\"\n",
|
|
1467
|
+
"display_table(explanation)\n",
|
|
1468
|
+
"\n",
|
|
1469
|
+
"print(f\"\\nCoverage: % of entities with enough tenure AND expected >=2 events in that window\")\n",
|
|
1470
|
+
"print(f\"Meaningful: among entities with enough tenure, % that have sufficient event density\")"
|
|
1471
|
+
]
|
|
1472
|
+
},
|
|
1473
|
+
{
|
|
1474
|
+
"cell_type": "code",
|
|
1475
|
+
"execution_count": null,
|
|
1476
|
+
"id": "06cab0fb",
|
|
1477
|
+
"metadata": {
|
|
1478
|
+
"execution": {
|
|
1479
|
+
"iopub.execute_input": "2026-02-02T13:00:49.170783Z",
|
|
1480
|
+
"iopub.status.busy": "2026-02-02T13:00:49.170665Z",
|
|
1481
|
+
"iopub.status.idle": "2026-02-02T13:00:49.175737Z",
|
|
1482
|
+
"shell.execute_reply": "2026-02-02T13:00:49.175087Z"
|
|
1483
|
+
},
|
|
1484
|
+
"papermill": {
|
|
1485
|
+
"duration": 0.049936,
|
|
1486
|
+
"end_time": "2026-02-02T13:00:49.176343",
|
|
1487
|
+
"exception": false,
|
|
1488
|
+
"start_time": "2026-02-02T13:00:49.126407",
|
|
1489
|
+
"status": "completed"
|
|
1490
|
+
},
|
|
1491
|
+
"tags": []
|
|
1492
|
+
},
|
|
1493
|
+
"outputs": [],
|
|
1494
|
+
"source": [
|
|
1495
|
+
"h = window_result.heterogeneity\n",
|
|
1496
|
+
"\n",
|
|
1497
|
+
"print(\"Temporal Heterogeneity (eta-squared):\")\n",
|
|
1498
|
+
"print(f\" eta² measures the fraction of variance in a metric explained by lifecycle quadrant grouping.\")\n",
|
|
1499
|
+
"print(f\" Scale: 0 = no group differences, 1 = all variance is between groups.\")\n",
|
|
1500
|
+
"print(f\" Thresholds: <0.06 = low | 0.06-0.14 = moderate | >0.14 = high effect size\\n\")\n",
|
|
1501
|
+
"\n",
|
|
1502
|
+
"eta_max = max(h.eta_squared_intensity, h.eta_squared_event_count)\n",
|
|
1503
|
+
"print(f\" Intensity eta²: {h.eta_squared_intensity:.3f} {'<-- dominant' if h.eta_squared_intensity >= h.eta_squared_event_count else ''}\")\n",
|
|
1504
|
+
"print(f\" Event count eta²: {h.eta_squared_event_count:.3f} {'<-- dominant' if h.eta_squared_event_count > h.eta_squared_intensity else ''}\")\n",
|
|
1505
|
+
"print(f\" Overall level: {h.heterogeneity_level.upper()} (max eta² = {eta_max:.3f})\\n\")\n",
|
|
1506
|
+
"\n",
|
|
1507
|
+
"advisory_labels = {\n",
|
|
1508
|
+
" \"single_model\": \"Single model with union windows is appropriate\",\n",
|
|
1509
|
+
" \"consider_segment_feature\": \"Add lifecycle_quadrant as a categorical feature to the model\",\n",
|
|
1510
|
+
" \"consider_separate_models\": \"Consider separate models for entities with vs without history\",\n",
|
|
1511
|
+
"}\n",
|
|
1512
|
+
"advisory_text = advisory_labels.get(h.segmentation_advisory, h.segmentation_advisory)\n",
|
|
1513
|
+
"\n",
|
|
1514
|
+
"print(f\"Recommendation: {advisory_text}\")\n",
|
|
1515
|
+
"for r in h.advisory_rationale:\n",
|
|
1516
|
+
" print(f\" -> {r}\")\n",
|
|
1517
|
+
"print()\n",
|
|
1518
|
+
"display_table(h.coverage_table)"
|
|
1519
|
+
]
|
|
1520
|
+
},
|
|
1521
|
+
{
|
|
1522
|
+
"cell_type": "code",
|
|
1523
|
+
"execution_count": null,
|
|
1524
|
+
"id": "6483b029",
|
|
1525
|
+
"metadata": {
|
|
1526
|
+
"execution": {
|
|
1527
|
+
"iopub.execute_input": "2026-02-02T13:00:49.232605Z",
|
|
1528
|
+
"iopub.status.busy": "2026-02-02T13:00:49.232481Z",
|
|
1529
|
+
"iopub.status.idle": "2026-02-02T13:00:49.248795Z",
|
|
1530
|
+
"shell.execute_reply": "2026-02-02T13:00:49.248116Z"
|
|
1531
|
+
},
|
|
1532
|
+
"papermill": {
|
|
1533
|
+
"duration": 0.045594,
|
|
1534
|
+
"end_time": "2026-02-02T13:00:49.249286",
|
|
1535
|
+
"exception": false,
|
|
1536
|
+
"start_time": "2026-02-02T13:00:49.203692",
|
|
1537
|
+
"status": "completed"
|
|
1538
|
+
},
|
|
1539
|
+
"tags": []
|
|
1540
|
+
},
|
|
1541
|
+
"outputs": [],
|
|
1542
|
+
"source": [
|
|
1543
|
+
"advisory_labels = {\n",
|
|
1544
|
+
" \"single_model\": \"Single model with union windows is appropriate\",\n",
|
|
1545
|
+
" \"consider_segment_feature\": \"Add lifecycle_quadrant as a categorical feature to the model\",\n",
|
|
1546
|
+
" \"consider_separate_models\": \"Consider separate models for entities with vs without history\",\n",
|
|
1547
|
+
"}\n",
|
|
1548
|
+
"\n",
|
|
1549
|
+
"# Preserve temporal_pattern from original findings if available\n",
|
|
1550
|
+
"existing_pattern = findings.time_series_metadata.temporal_pattern if findings.time_series_metadata else None\n",
|
|
1551
|
+
"\n",
|
|
1552
|
+
"ts_metadata = TimeSeriesMetadata(\n",
|
|
1553
|
+
" granularity=DatasetGranularity.EVENT_LEVEL,\n",
|
|
1554
|
+
" temporal_pattern=existing_pattern,\n",
|
|
1555
|
+
" entity_column=ENTITY_COLUMN,\n",
|
|
1556
|
+
" time_column=TIME_COLUMN,\n",
|
|
1557
|
+
" avg_events_per_entity=ts_profile.events_per_entity.mean,\n",
|
|
1558
|
+
" time_span_days=ts_profile.time_span_days,\n",
|
|
1559
|
+
" unique_entities=ts_profile.unique_entities,\n",
|
|
1560
|
+
" suggested_aggregations=window_result.windows,\n",
|
|
1561
|
+
" window_coverage_threshold=window_result.coverage_threshold,\n",
|
|
1562
|
+
" heterogeneity_level=window_result.heterogeneity.heterogeneity_level,\n",
|
|
1563
|
+
" eta_squared_intensity=window_result.heterogeneity.eta_squared_intensity,\n",
|
|
1564
|
+
" eta_squared_event_count=window_result.heterogeneity.eta_squared_event_count,\n",
|
|
1565
|
+
" temporal_segmentation_advisory=window_result.heterogeneity.segmentation_advisory,\n",
|
|
1566
|
+
" temporal_segmentation_recommendation=advisory_labels.get(\n",
|
|
1567
|
+
" window_result.heterogeneity.segmentation_advisory,\n",
|
|
1568
|
+
" window_result.heterogeneity.segmentation_advisory,\n",
|
|
1569
|
+
" ),\n",
|
|
1570
|
+
" drift_risk_level=drift.risk_level,\n",
|
|
1571
|
+
" volume_drift_risk=drift.volume_drift_risk,\n",
|
|
1572
|
+
" population_stability=drift.population_stability,\n",
|
|
1573
|
+
" regime_count=drift.regime_count,\n",
|
|
1574
|
+
" recommended_training_start=(\n",
|
|
1575
|
+
" drift.recommended_training_start.isoformat() if drift.recommended_training_start else None\n",
|
|
1576
|
+
" ),\n",
|
|
1577
|
+
")\n",
|
|
1578
|
+
"\n",
|
|
1579
|
+
"findings.time_series_metadata = ts_metadata\n",
|
|
1580
|
+
"findings.save(FINDINGS_PATH)\n",
|
|
1581
|
+
"\n",
|
|
1582
|
+
"print(f\"Updated findings saved to: {FINDINGS_PATH}\")\n",
|
|
1583
|
+
"print(f\" Suggested aggregations: {ts_metadata.suggested_aggregations}\")\n",
|
|
1584
|
+
"print(f\" Heterogeneity: {ts_metadata.heterogeneity_level}\")\n",
|
|
1585
|
+
"print(f\" Recommendation: {ts_metadata.temporal_segmentation_recommendation}\")\n",
|
|
1586
|
+
"print(f\" Drift risk: {ts_metadata.drift_risk_level}\")\n"
|
|
1587
|
+
]
|
|
1588
|
+
},
|
|
1589
|
+
{
|
|
1590
|
+
"cell_type": "markdown",
|
|
1591
|
+
"id": "0a8eeb4c",
|
|
1592
|
+
"metadata": {
|
|
1593
|
+
"papermill": {
|
|
1594
|
+
"duration": 0.032613,
|
|
1595
|
+
"end_time": "2026-02-02T13:00:49.307825",
|
|
1596
|
+
"exception": false,
|
|
1597
|
+
"start_time": "2026-02-02T13:00:49.275212",
|
|
1598
|
+
"status": "completed"
|
|
1599
|
+
},
|
|
1600
|
+
"tags": []
|
|
1601
|
+
},
|
|
1602
|
+
"source": [
|
|
1603
|
+
"---\n",
|
|
1604
|
+
"\n",
|
|
1605
|
+
"## Summary: What We Learned\n",
|
|
1606
|
+
"\n",
|
|
1607
|
+
"In this notebook, we performed a deep dive on time series data:\n",
|
|
1608
|
+
"\n",
|
|
1609
|
+
"1. **Event Distribution** - Analyzed how events are distributed across entities\n",
|
|
1610
|
+
"2. **Activity Segments** - Categorized entities by activity level (one-time, low, medium, high)\n",
|
|
1611
|
+
"3. **Lifecycle Analysis** - Examined entity tenure and duration patterns\n",
|
|
1612
|
+
"4. **Temporal Coverage** - Visualized data volume over time\n",
|
|
1613
|
+
"5. **Inter-Event Timing** - Understood engagement frequency patterns\n",
|
|
1614
|
+
"6. **Feature Opportunities** - Identified time-window aggregations and recency features\n",
|
|
1615
|
+
"\n",
|
|
1616
|
+
"## Key Metrics for This Dataset\n",
|
|
1617
|
+
"\n",
|
|
1618
|
+
"| Metric | Value |\n",
|
|
1619
|
+
"|--------|-------|\n",
|
|
1620
|
+
"| Unique Entities | Fill from ts_profile |\n",
|
|
1621
|
+
"| Avg Events/Entity | Fill from ts_profile |\n",
|
|
1622
|
+
"| Median Lifecycle | Fill from analysis |\n",
|
|
1623
|
+
"| Median Inter-Event Days | Fill from analysis |\n",
|
|
1624
|
+
"\n",
|
|
1625
|
+
"---\n",
|
|
1626
|
+
"\n",
|
|
1627
|
+
"## Next Steps\n",
|
|
1628
|
+
"\n",
|
|
1629
|
+
"Continue with the **Event Bronze Track**:\n",
|
|
1630
|
+
"\n",
|
|
1631
|
+
"1. **01b_temporal_quality.ipynb** - Check for duplicate events, temporal gaps, future dates\n",
|
|
1632
|
+
"2. **01c_temporal_patterns.ipynb** - Detect trends, seasonality, cohort analysis\n",
|
|
1633
|
+
"3. **01d_event_aggregation.ipynb** - Aggregate events to entity-level (produces new dataset)\n",
|
|
1634
|
+
"\n",
|
|
1635
|
+
"After completing 01d, continue with the **Entity Bronze Track** (02 → 03 → 04) on the aggregated data."
|
|
1636
|
+
]
|
|
1637
|
+
},
|
|
1638
|
+
{
|
|
1639
|
+
"cell_type": "markdown",
|
|
1640
|
+
"id": "20a706ed",
|
|
1641
|
+
"metadata": {
|
|
1642
|
+
"papermill": {
|
|
1643
|
+
"duration": 0.027747,
|
|
1644
|
+
"end_time": "2026-02-02T13:00:49.364696",
|
|
1645
|
+
"exception": false,
|
|
1646
|
+
"start_time": "2026-02-02T13:00:49.336949",
|
|
1647
|
+
"status": "completed"
|
|
1648
|
+
},
|
|
1649
|
+
"tags": []
|
|
1650
|
+
},
|
|
1651
|
+
"source": [
|
|
1652
|
+
"> **Save Reminder:** Save this notebook (Ctrl+S / Cmd+S) before running the next one.\n",
|
|
1653
|
+
"> The next notebook will automatically export this notebook's HTML documentation from the saved file."
|
|
1654
|
+
]
|
|
1655
|
+
}
|
|
1656
|
+
],
|
|
1657
|
+
"metadata": {
|
|
1658
|
+
"kernelspec": {
|
|
1659
|
+
"display_name": "Python 3",
|
|
1660
|
+
"language": "python",
|
|
1661
|
+
"name": "python3"
|
|
1662
|
+
},
|
|
1663
|
+
"language_info": {
|
|
1664
|
+
"codemirror_mode": {
|
|
1665
|
+
"name": "ipython",
|
|
1666
|
+
"version": 3
|
|
1667
|
+
},
|
|
1668
|
+
"file_extension": ".py",
|
|
1669
|
+
"mimetype": "text/x-python",
|
|
1670
|
+
"name": "python",
|
|
1671
|
+
"nbconvert_exporter": "python",
|
|
1672
|
+
"pygments_lexer": "ipython3",
|
|
1673
|
+
"version": "3.12.4"
|
|
1674
|
+
},
|
|
1675
|
+
"papermill": {
|
|
1676
|
+
"default_parameters": {},
|
|
1677
|
+
"duration": 9.174603,
|
|
1678
|
+
"end_time": "2026-02-02T13:00:52.007930",
|
|
1679
|
+
"environment_variables": {},
|
|
1680
|
+
"exception": null,
|
|
1681
|
+
"input_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/01a_temporal_deep_dive.ipynb",
|
|
1682
|
+
"output_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/01a_temporal_deep_dive.ipynb",
|
|
1683
|
+
"parameters": {},
|
|
1684
|
+
"start_time": "2026-02-02T13:00:42.833327",
|
|
1685
|
+
"version": "2.6.0"
|
|
1686
|
+
}
|
|
1687
|
+
},
|
|
1688
|
+
"nbformat": 4,
|
|
1689
|
+
"nbformat_minor": 5
|
|
1690
|
+
}
|