churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
|
@@ -0,0 +1,3305 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"id": "b520cf17",
|
|
6
|
+
"metadata": {
|
|
7
|
+
"papermill": {
|
|
8
|
+
"duration": 0.005229,
|
|
9
|
+
"end_time": "2026-02-02T13:01:08.982002",
|
|
10
|
+
"exception": false,
|
|
11
|
+
"start_time": "2026-02-02T13:01:08.976773",
|
|
12
|
+
"status": "completed"
|
|
13
|
+
},
|
|
14
|
+
"tags": []
|
|
15
|
+
},
|
|
16
|
+
"source": [
|
|
17
|
+
"# Chapter 1c: Temporal Pattern Analysis (Event Bronze Track)\n",
|
|
18
|
+
"\n",
|
|
19
|
+
"**Purpose:** Discover temporal patterns in event-level data that inform feature engineering and model design.\n",
|
|
20
|
+
"\n",
|
|
21
|
+
"**When to use this notebook:**\n",
|
|
22
|
+
"- After completing 01a and 01b (temporal deep dive and quality checks)\n",
|
|
23
|
+
"- Your dataset is EVENT_LEVEL granularity\n",
|
|
24
|
+
"- You want to understand time-based patterns before aggregation\n",
|
|
25
|
+
"\n",
|
|
26
|
+
"**What you'll learn:**\n",
|
|
27
|
+
"- How to detect long-term trends in your data\n",
|
|
28
|
+
"- How to identify seasonality patterns (weekly, monthly)\n",
|
|
29
|
+
"- How cohort analysis reveals customer lifecycle patterns\n",
|
|
30
|
+
"- How recency relates to target outcomes\n",
|
|
31
|
+
"\n",
|
|
32
|
+
"**Pattern Categories:**\n",
|
|
33
|
+
"\n",
|
|
34
|
+
"| Pattern | Description | Feature Engineering Impact |\n",
|
|
35
|
+
"|---------|-------------|---------------------------|\n",
|
|
36
|
+
"| **Trend** | Long-term direction (up/down) | Detrend features, add trend slope |\n",
|
|
37
|
+
"| **Seasonality** | Periodic patterns (weekly, monthly) | Add cyclical encodings, seasonal indicators |\n",
|
|
38
|
+
"| **Cohort Effects** | Behavior varies by join date | Add cohort features, stratify models |\n",
|
|
39
|
+
"| **Recency Effects** | Recent activity predicts outcomes | Prioritize recent time windows |"
|
|
40
|
+
]
|
|
41
|
+
},
|
|
42
|
+
{
|
|
43
|
+
"cell_type": "markdown",
|
|
44
|
+
"id": "60ff56d9",
|
|
45
|
+
"metadata": {
|
|
46
|
+
"papermill": {
|
|
47
|
+
"duration": 0.004298,
|
|
48
|
+
"end_time": "2026-02-02T13:01:08.991551",
|
|
49
|
+
"exception": false,
|
|
50
|
+
"start_time": "2026-02-02T13:01:08.987253",
|
|
51
|
+
"status": "completed"
|
|
52
|
+
},
|
|
53
|
+
"tags": []
|
|
54
|
+
},
|
|
55
|
+
"source": [
|
|
56
|
+
"## 1c.1 Load Findings and Data"
|
|
57
|
+
]
|
|
58
|
+
},
|
|
59
|
+
{
|
|
60
|
+
"cell_type": "code",
|
|
61
|
+
"execution_count": null,
|
|
62
|
+
"id": "bad2476c",
|
|
63
|
+
"metadata": {
|
|
64
|
+
"execution": {
|
|
65
|
+
"iopub.execute_input": "2026-02-02T13:01:09.001140Z",
|
|
66
|
+
"iopub.status.busy": "2026-02-02T13:01:09.000972Z",
|
|
67
|
+
"iopub.status.idle": "2026-02-02T13:01:10.816526Z",
|
|
68
|
+
"shell.execute_reply": "2026-02-02T13:01:10.815331Z"
|
|
69
|
+
},
|
|
70
|
+
"papermill": {
|
|
71
|
+
"duration": 1.821681,
|
|
72
|
+
"end_time": "2026-02-02T13:01:10.817446",
|
|
73
|
+
"exception": false,
|
|
74
|
+
"start_time": "2026-02-02T13:01:08.995765",
|
|
75
|
+
"status": "completed"
|
|
76
|
+
},
|
|
77
|
+
"tags": []
|
|
78
|
+
},
|
|
79
|
+
"outputs": [],
|
|
80
|
+
"source": [
|
|
81
|
+
"from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
|
|
82
|
+
"track_and_export_previous(\"01c_temporal_patterns.ipynb\")\n",
|
|
83
|
+
"\n",
|
|
84
|
+
"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
|
|
85
|
+
"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
|
|
86
|
+
"from customer_retention.core.config.column_config import ColumnType, DatasetGranularity\n",
|
|
87
|
+
"from customer_retention.stages.profiling import (\n",
|
|
88
|
+
" TemporalPatternAnalyzer, TemporalPatternAnalysis,\n",
|
|
89
|
+
" TrendResult, TrendDirection, SeasonalityResult, RecencyResult,\n",
|
|
90
|
+
" TemporalFeatureAnalyzer, VelocityResult, MomentumResult,\n",
|
|
91
|
+
" LagCorrelationResult, PredictivePowerResult, FeatureRecommendation,\n",
|
|
92
|
+
" CategoricalTargetAnalyzer\n",
|
|
93
|
+
")\n",
|
|
94
|
+
"import pandas as pd\n",
|
|
95
|
+
"import numpy as np\n",
|
|
96
|
+
"import plotly.graph_objects as go\n",
|
|
97
|
+
"import plotly.express as px\n",
|
|
98
|
+
"from plotly.subplots import make_subplots\n",
|
|
99
|
+
"from scipy import stats\n",
|
|
100
|
+
"from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure\n"
|
|
101
|
+
]
|
|
102
|
+
},
|
|
103
|
+
{
|
|
104
|
+
"cell_type": "code",
|
|
105
|
+
"execution_count": null,
|
|
106
|
+
"id": "19b4e63f",
|
|
107
|
+
"metadata": {
|
|
108
|
+
"execution": {
|
|
109
|
+
"iopub.execute_input": "2026-02-02T13:01:10.827670Z",
|
|
110
|
+
"iopub.status.busy": "2026-02-02T13:01:10.827523Z",
|
|
111
|
+
"iopub.status.idle": "2026-02-02T13:01:10.856634Z",
|
|
112
|
+
"shell.execute_reply": "2026-02-02T13:01:10.855878Z"
|
|
113
|
+
},
|
|
114
|
+
"papermill": {
|
|
115
|
+
"duration": 0.035888,
|
|
116
|
+
"end_time": "2026-02-02T13:01:10.857491",
|
|
117
|
+
"exception": false,
|
|
118
|
+
"start_time": "2026-02-02T13:01:10.821603",
|
|
119
|
+
"status": "completed"
|
|
120
|
+
},
|
|
121
|
+
"tags": []
|
|
122
|
+
},
|
|
123
|
+
"outputs": [],
|
|
124
|
+
"source": [
|
|
125
|
+
"# === CONFIGURATION ===\n",
|
|
126
|
+
"from pathlib import Path\n",
|
|
127
|
+
"\n",
|
|
128
|
+
"# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
|
|
129
|
+
"\n",
|
|
130
|
+
"findings_files = [f for f in FINDINGS_DIR.glob(\"*_findings.yaml\") if \"multi_dataset\" not in f.name]\n",
|
|
131
|
+
"if not findings_files:\n",
|
|
132
|
+
" raise FileNotFoundError(f\"No findings files found in {FINDINGS_DIR}. Run notebook 01 first.\")\n",
|
|
133
|
+
"\n",
|
|
134
|
+
"findings_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)\n",
|
|
135
|
+
"FINDINGS_PATH = str(findings_files[0])\n",
|
|
136
|
+
"\n",
|
|
137
|
+
"print(f\"Using: {FINDINGS_PATH}\")\n",
|
|
138
|
+
"findings = ExplorationFindings.load(FINDINGS_PATH)\n",
|
|
139
|
+
"print(f\"Loaded findings for {findings.column_count} columns\")"
|
|
140
|
+
]
|
|
141
|
+
},
|
|
142
|
+
{
|
|
143
|
+
"cell_type": "code",
|
|
144
|
+
"execution_count": null,
|
|
145
|
+
"id": "9dffc317",
|
|
146
|
+
"metadata": {
|
|
147
|
+
"execution": {
|
|
148
|
+
"iopub.execute_input": "2026-02-02T13:01:10.866594Z",
|
|
149
|
+
"iopub.status.busy": "2026-02-02T13:01:10.866471Z",
|
|
150
|
+
"iopub.status.idle": "2026-02-02T13:01:10.869429Z",
|
|
151
|
+
"shell.execute_reply": "2026-02-02T13:01:10.868628Z"
|
|
152
|
+
},
|
|
153
|
+
"papermill": {
|
|
154
|
+
"duration": 0.00852,
|
|
155
|
+
"end_time": "2026-02-02T13:01:10.870225",
|
|
156
|
+
"exception": false,
|
|
157
|
+
"start_time": "2026-02-02T13:01:10.861705",
|
|
158
|
+
"status": "completed"
|
|
159
|
+
},
|
|
160
|
+
"tags": []
|
|
161
|
+
},
|
|
162
|
+
"outputs": [],
|
|
163
|
+
"source": [
|
|
164
|
+
"# Get time series configuration\n",
|
|
165
|
+
"ts_meta = findings.time_series_metadata\n",
|
|
166
|
+
"ENTITY_COLUMN = ts_meta.entity_column if ts_meta else None\n",
|
|
167
|
+
"TIME_COLUMN = ts_meta.time_column if ts_meta else None\n",
|
|
168
|
+
"\n",
|
|
169
|
+
"print(f\"Entity column: {ENTITY_COLUMN}\")\n",
|
|
170
|
+
"print(f\"Time column: {TIME_COLUMN}\")\n",
|
|
171
|
+
"\n",
|
|
172
|
+
"# Note: Target column configuration is handled in section 1c.2 below\n",
|
|
173
|
+
"# This allows for event-level to entity-level aggregation when needed"
|
|
174
|
+
]
|
|
175
|
+
},
|
|
176
|
+
{
|
|
177
|
+
"cell_type": "code",
|
|
178
|
+
"execution_count": null,
|
|
179
|
+
"id": "dd209a65",
|
|
180
|
+
"metadata": {
|
|
181
|
+
"execution": {
|
|
182
|
+
"iopub.execute_input": "2026-02-02T13:01:10.879383Z",
|
|
183
|
+
"iopub.status.busy": "2026-02-02T13:01:10.879231Z",
|
|
184
|
+
"iopub.status.idle": "2026-02-02T13:01:11.570935Z",
|
|
185
|
+
"shell.execute_reply": "2026-02-02T13:01:11.570426Z"
|
|
186
|
+
},
|
|
187
|
+
"papermill": {
|
|
188
|
+
"duration": 0.697139,
|
|
189
|
+
"end_time": "2026-02-02T13:01:11.571589",
|
|
190
|
+
"exception": false,
|
|
191
|
+
"start_time": "2026-02-02T13:01:10.874450",
|
|
192
|
+
"status": "completed"
|
|
193
|
+
},
|
|
194
|
+
"tags": []
|
|
195
|
+
},
|
|
196
|
+
"outputs": [],
|
|
197
|
+
"source": [
|
|
198
|
+
"from customer_retention.stages.temporal import load_data_with_snapshot_preference, TEMPORAL_METADATA_COLS\n",
|
|
199
|
+
"\n",
|
|
200
|
+
"# Load source data (prefers snapshots over raw files)\n",
|
|
201
|
+
"df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
|
|
202
|
+
"charts = ChartBuilder()\n",
|
|
203
|
+
"\n",
|
|
204
|
+
"# Parse time column\n",
|
|
205
|
+
"df[TIME_COLUMN] = pd.to_datetime(df[TIME_COLUMN])\n",
|
|
206
|
+
"\n",
|
|
207
|
+
"print(f\"Loaded {len(df):,} rows x {len(df.columns)} columns\")\n",
|
|
208
|
+
"print(f\"Data source: {data_source}\")"
|
|
209
|
+
]
|
|
210
|
+
},
|
|
211
|
+
{
|
|
212
|
+
"cell_type": "markdown",
|
|
213
|
+
"id": "d7bff166",
|
|
214
|
+
"metadata": {
|
|
215
|
+
"papermill": {
|
|
216
|
+
"duration": 0.003697,
|
|
217
|
+
"end_time": "2026-02-02T13:01:11.579477",
|
|
218
|
+
"exception": false,
|
|
219
|
+
"start_time": "2026-02-02T13:01:11.575780",
|
|
220
|
+
"status": "completed"
|
|
221
|
+
},
|
|
222
|
+
"tags": []
|
|
223
|
+
},
|
|
224
|
+
"source": [
|
|
225
|
+
"## 1c.2 Target Column Configuration\n",
|
|
226
|
+
"\n",
|
|
227
|
+
"**š Event-Level vs Entity-Level Targets:**\n",
|
|
228
|
+
"\n",
|
|
229
|
+
"In time series data, targets can be defined at different granularities:\n",
|
|
230
|
+
"\n",
|
|
231
|
+
"| Target Level | Example | Usage |\n",
|
|
232
|
+
"|--------------|---------|-------|\n",
|
|
233
|
+
"| **Event-level** | \"Did this email get clicked?\" | Exists in raw data |\n",
|
|
234
|
+
"| **Entity-level** | \"Did this customer churn?\" | Need to join from entity table |\n",
|
|
235
|
+
"\n",
|
|
236
|
+
"If your target is entity-level, you may need to join it or configure it manually."
|
|
237
|
+
]
|
|
238
|
+
},
|
|
239
|
+
{
|
|
240
|
+
"cell_type": "code",
|
|
241
|
+
"execution_count": null,
|
|
242
|
+
"id": "9e9c32ab",
|
|
243
|
+
"metadata": {
|
|
244
|
+
"execution": {
|
|
245
|
+
"iopub.execute_input": "2026-02-02T13:01:11.589468Z",
|
|
246
|
+
"iopub.status.busy": "2026-02-02T13:01:11.589238Z",
|
|
247
|
+
"iopub.status.idle": "2026-02-02T13:01:11.613692Z",
|
|
248
|
+
"shell.execute_reply": "2026-02-02T13:01:11.613146Z"
|
|
249
|
+
},
|
|
250
|
+
"papermill": {
|
|
251
|
+
"duration": 0.02988,
|
|
252
|
+
"end_time": "2026-02-02T13:01:11.614340",
|
|
253
|
+
"exception": false,
|
|
254
|
+
"start_time": "2026-02-02T13:01:11.584460",
|
|
255
|
+
"status": "completed"
|
|
256
|
+
},
|
|
257
|
+
"tags": []
|
|
258
|
+
},
|
|
259
|
+
"outputs": [],
|
|
260
|
+
"source": [
|
|
261
|
+
"# === TARGET CONFIGURATION ===\n",
|
|
262
|
+
"# Override target column if needed (None = auto-detect, \"DEFER_TO_MULTI_DATASET\" = skip)\n",
|
|
263
|
+
"TARGET_COLUMN_OVERRIDE = None\n",
|
|
264
|
+
"TARGET_AGGREGATION = \"max\" # Options: \"max\", \"mean\", \"sum\", \"last\", \"first\"\n",
|
|
265
|
+
"\n",
|
|
266
|
+
"# Detect and analyze target\n",
|
|
267
|
+
"from customer_retention.stages.profiling import (\n",
|
|
268
|
+
" TargetLevelAnalyzer, TargetColumnDetector, AggregationMethod\n",
|
|
269
|
+
")\n",
|
|
270
|
+
"\n",
|
|
271
|
+
"detector = TargetColumnDetector()\n",
|
|
272
|
+
"target_col, method = detector.detect(findings, df, override=TARGET_COLUMN_OVERRIDE)\n",
|
|
273
|
+
"detector.print_detection(target_col, method)\n",
|
|
274
|
+
"\n",
|
|
275
|
+
"TARGET_COLUMN = target_col\n",
|
|
276
|
+
"if TARGET_COLUMN and TARGET_COLUMN in df.columns and ENTITY_COLUMN:\n",
|
|
277
|
+
" analyzer = TargetLevelAnalyzer()\n",
|
|
278
|
+
" agg_method = AggregationMethod(TARGET_AGGREGATION)\n",
|
|
279
|
+
" df, result = analyzer.aggregate_to_entity(df, TARGET_COLUMN, ENTITY_COLUMN, TIME_COLUMN, agg_method)\n",
|
|
280
|
+
" analyzer.print_analysis(result)\n",
|
|
281
|
+
" \n",
|
|
282
|
+
" # Update TARGET_COLUMN to entity-level version if aggregated\n",
|
|
283
|
+
" if result.entity_target_column:\n",
|
|
284
|
+
" ORIGINAL_TARGET = TARGET_COLUMN\n",
|
|
285
|
+
" TARGET_COLUMN = result.entity_target_column\n",
|
|
286
|
+
"\n",
|
|
287
|
+
"print(\"\\n\" + \"ā\"*70)\n",
|
|
288
|
+
"print(f\"Final configuration:\")\n",
|
|
289
|
+
"print(f\" ENTITY_COLUMN: {ENTITY_COLUMN}\")\n",
|
|
290
|
+
"print(f\" TIME_COLUMN: {TIME_COLUMN}\")\n",
|
|
291
|
+
"print(f\" TARGET_COLUMN: {TARGET_COLUMN}\")\n",
|
|
292
|
+
"print(\"ā\"*70)\n"
|
|
293
|
+
]
|
|
294
|
+
},
|
|
295
|
+
{
|
|
296
|
+
"cell_type": "markdown",
|
|
297
|
+
"id": "3fb8f311",
|
|
298
|
+
"metadata": {
|
|
299
|
+
"papermill": {
|
|
300
|
+
"duration": 0.003734,
|
|
301
|
+
"end_time": "2026-02-02T13:01:11.622150",
|
|
302
|
+
"exception": false,
|
|
303
|
+
"start_time": "2026-02-02T13:01:11.618416",
|
|
304
|
+
"status": "completed"
|
|
305
|
+
},
|
|
306
|
+
"tags": []
|
|
307
|
+
},
|
|
308
|
+
"source": [
|
|
309
|
+
"## 1c.3 Aggregation Window Configuration\n",
|
|
310
|
+
"\n",
|
|
311
|
+
"**āļø Central Configuration for All Pattern Analysis**\n",
|
|
312
|
+
"\n",
|
|
313
|
+
"Windows are loaded from 01a findings and used consistently throughout this notebook for:\n",
|
|
314
|
+
"- Velocity analysis (shortest window)\n",
|
|
315
|
+
"- Momentum analysis (window pairs)\n",
|
|
316
|
+
"- Rolling statistics\n",
|
|
317
|
+
"- Feature engineering recommendations\n",
|
|
318
|
+
"\n",
|
|
319
|
+
"Override below if needed for your specific analysis.\n"
|
|
320
|
+
]
|
|
321
|
+
},
|
|
322
|
+
{
|
|
323
|
+
"cell_type": "code",
|
|
324
|
+
"execution_count": null,
|
|
325
|
+
"id": "0e8069c1",
|
|
326
|
+
"metadata": {
|
|
327
|
+
"execution": {
|
|
328
|
+
"iopub.execute_input": "2026-02-02T13:01:11.630971Z",
|
|
329
|
+
"iopub.status.busy": "2026-02-02T13:01:11.630852Z",
|
|
330
|
+
"iopub.status.idle": "2026-02-02T13:01:11.634574Z",
|
|
331
|
+
"shell.execute_reply": "2026-02-02T13:01:11.633796Z"
|
|
332
|
+
},
|
|
333
|
+
"papermill": {
|
|
334
|
+
"duration": 0.008656,
|
|
335
|
+
"end_time": "2026-02-02T13:01:11.635049",
|
|
336
|
+
"exception": false,
|
|
337
|
+
"start_time": "2026-02-02T13:01:11.626393",
|
|
338
|
+
"status": "completed"
|
|
339
|
+
},
|
|
340
|
+
"tags": []
|
|
341
|
+
},
|
|
342
|
+
"outputs": [],
|
|
343
|
+
"source": [
|
|
344
|
+
"# === AGGREGATION WINDOW CONFIGURATION ===\n",
|
|
345
|
+
"# These windows were recommended by 01a based on your data's temporal coverage.\n",
|
|
346
|
+
"# They are used consistently for velocity, momentum, rolling stats, and feature engineering.\n",
|
|
347
|
+
"\n",
|
|
348
|
+
"# Override: Set to a list like [\"7d\", \"30d\", \"90d\"] to use custom windows\n",
|
|
349
|
+
"# Set to None to use 01a recommendations\n",
|
|
350
|
+
"WINDOW_OVERRIDE = None\n",
|
|
351
|
+
"\n",
|
|
352
|
+
"from customer_retention.stages.profiling import PatternAnalysisConfig\n",
|
|
353
|
+
"\n",
|
|
354
|
+
"pattern_config = PatternAnalysisConfig.from_findings(\n",
|
|
355
|
+
" findings,\n",
|
|
356
|
+
" target_column=TARGET_COLUMN,\n",
|
|
357
|
+
" window_override=WINDOW_OVERRIDE,\n",
|
|
358
|
+
")\n",
|
|
359
|
+
"\n",
|
|
360
|
+
"# Display configuration\n",
|
|
361
|
+
"print(\"=\"*70)\n",
|
|
362
|
+
"print(\"AGGREGATION WINDOW CONFIGURATION\")\n",
|
|
363
|
+
"print(\"=\"*70)\n",
|
|
364
|
+
"print(f\"\\nSource: {'Manual override' if WINDOW_OVERRIDE else '01a findings (recommended)'}\")\n",
|
|
365
|
+
"print(f\"\\nWindows: {pattern_config.aggregation_windows}\")\n",
|
|
366
|
+
"print(f\"\\nDerived settings used throughout this notebook:\")\n",
|
|
367
|
+
"print(f\" ⢠Velocity/Rolling window: {pattern_config.velocity_window_days} days\")\n",
|
|
368
|
+
"print(f\" ⢠Momentum pairs: {pattern_config.get_momentum_pairs()}\")\n",
|
|
369
|
+
"print(f\"\\nš” To override, set WINDOW_OVERRIDE = ['7d', '30d', '90d'] above and re-run\")\n"
|
|
370
|
+
]
|
|
371
|
+
},
|
|
372
|
+
{
|
|
373
|
+
"cell_type": "markdown",
|
|
374
|
+
"id": "5070c63e",
|
|
375
|
+
"metadata": {
|
|
376
|
+
"papermill": {
|
|
377
|
+
"duration": 0.00372,
|
|
378
|
+
"end_time": "2026-02-02T13:01:11.643084",
|
|
379
|
+
"exception": false,
|
|
380
|
+
"start_time": "2026-02-02T13:01:11.639364",
|
|
381
|
+
"status": "completed"
|
|
382
|
+
},
|
|
383
|
+
"tags": []
|
|
384
|
+
},
|
|
385
|
+
"source": [
|
|
386
|
+
"## 1c.4 Configure Value Column for Analysis\n",
|
|
387
|
+
"\n",
|
|
388
|
+
"Temporal patterns are analyzed on aggregated metrics. Choose the primary metric to analyze."
|
|
389
|
+
]
|
|
390
|
+
},
|
|
391
|
+
{
|
|
392
|
+
"cell_type": "code",
|
|
393
|
+
"execution_count": null,
|
|
394
|
+
"id": "c6cc58ef",
|
|
395
|
+
"metadata": {
|
|
396
|
+
"execution": {
|
|
397
|
+
"iopub.execute_input": "2026-02-02T13:01:11.652297Z",
|
|
398
|
+
"iopub.status.busy": "2026-02-02T13:01:11.652178Z",
|
|
399
|
+
"iopub.status.idle": "2026-02-02T13:01:11.656977Z",
|
|
400
|
+
"shell.execute_reply": "2026-02-02T13:01:11.656499Z"
|
|
401
|
+
},
|
|
402
|
+
"papermill": {
|
|
403
|
+
"duration": 0.01026,
|
|
404
|
+
"end_time": "2026-02-02T13:01:11.657923",
|
|
405
|
+
"exception": false,
|
|
406
|
+
"start_time": "2026-02-02T13:01:11.647663",
|
|
407
|
+
"status": "completed"
|
|
408
|
+
},
|
|
409
|
+
"tags": []
|
|
410
|
+
},
|
|
411
|
+
"outputs": [],
|
|
412
|
+
"source": [
|
|
413
|
+
"# Find numeric columns for pattern analysis\n",
|
|
414
|
+
"numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()\n",
|
|
415
|
+
"numeric_cols = [c for c in numeric_cols if c not in [ENTITY_COLUMN] and c not in TEMPORAL_METADATA_COLS]\n",
|
|
416
|
+
"\n",
|
|
417
|
+
"# Separate target columns from feature columns\n",
|
|
418
|
+
"target_cols = [c for c in numeric_cols if c.lower() in ['target', 'target_entity', 'label'] \n",
|
|
419
|
+
" or (TARGET_COLUMN and c.lower() == TARGET_COLUMN.lower())]\n",
|
|
420
|
+
"feature_cols = [c for c in numeric_cols if c not in target_cols]\n",
|
|
421
|
+
"\n",
|
|
422
|
+
"print(\"Numeric columns for pattern analysis:\")\n",
|
|
423
|
+
"print(\"\\n FEATURE COLUMNS (can derive features from):\")\n",
|
|
424
|
+
"for col in feature_cols:\n",
|
|
425
|
+
" print(f\" - {col}\")\n",
|
|
426
|
+
"\n",
|
|
427
|
+
"if target_cols:\n",
|
|
428
|
+
" print(\"\\n TARGET COLUMNS (analysis only - never derive features):\")\n",
|
|
429
|
+
" for col in target_cols:\n",
|
|
430
|
+
" print(f\" - {col} [TARGET]\")\n",
|
|
431
|
+
"\n",
|
|
432
|
+
"# Default: use event count (most common for pattern detection)\n",
|
|
433
|
+
"# Change this to analyze patterns in a specific metric\n",
|
|
434
|
+
"VALUE_COLUMN = \"_event_count\" # Special: will aggregate event counts"
|
|
435
|
+
]
|
|
436
|
+
},
|
|
437
|
+
{
|
|
438
|
+
"cell_type": "code",
|
|
439
|
+
"execution_count": null,
|
|
440
|
+
"id": "8107373c",
|
|
441
|
+
"metadata": {
|
|
442
|
+
"execution": {
|
|
443
|
+
"iopub.execute_input": "2026-02-02T13:01:11.668478Z",
|
|
444
|
+
"iopub.status.busy": "2026-02-02T13:01:11.668364Z",
|
|
445
|
+
"iopub.status.idle": "2026-02-02T13:01:11.685862Z",
|
|
446
|
+
"shell.execute_reply": "2026-02-02T13:01:11.685471Z"
|
|
447
|
+
},
|
|
448
|
+
"papermill": {
|
|
449
|
+
"duration": 0.023666,
|
|
450
|
+
"end_time": "2026-02-02T13:01:11.686495",
|
|
451
|
+
"exception": false,
|
|
452
|
+
"start_time": "2026-02-02T13:01:11.662829",
|
|
453
|
+
"status": "completed"
|
|
454
|
+
},
|
|
455
|
+
"tags": []
|
|
456
|
+
},
|
|
457
|
+
"outputs": [],
|
|
458
|
+
"source": [
|
|
459
|
+
"# Prepare data for pattern analysis\n",
|
|
460
|
+
"# Aggregate to daily level for trend/seasonality detection\n",
|
|
461
|
+
"\n",
|
|
462
|
+
"if VALUE_COLUMN == \"_event_count\":\n",
|
|
463
|
+
" # Aggregate event counts by day\n",
|
|
464
|
+
" daily_data = df.groupby(df[TIME_COLUMN].dt.date).size().reset_index()\n",
|
|
465
|
+
" daily_data.columns = [TIME_COLUMN, \"value\"]\n",
|
|
466
|
+
" daily_data[TIME_COLUMN] = pd.to_datetime(daily_data[TIME_COLUMN])\n",
|
|
467
|
+
" analysis_col = \"value\"\n",
|
|
468
|
+
" print(\"Analyzing: Daily event counts\")\n",
|
|
469
|
+
"else:\n",
|
|
470
|
+
" # Aggregate specific column by day\n",
|
|
471
|
+
" daily_data = df.groupby(df[TIME_COLUMN].dt.date)[VALUE_COLUMN].sum().reset_index()\n",
|
|
472
|
+
" daily_data.columns = [TIME_COLUMN, \"value\"]\n",
|
|
473
|
+
" daily_data[TIME_COLUMN] = pd.to_datetime(daily_data[TIME_COLUMN])\n",
|
|
474
|
+
" analysis_col = \"value\"\n",
|
|
475
|
+
" print(f\"Analyzing: Daily sum of {VALUE_COLUMN}\")\n",
|
|
476
|
+
"\n",
|
|
477
|
+
"print(f\"\\nDaily data points: {len(daily_data)}\")\n",
|
|
478
|
+
"print(f\"Date range: {daily_data[TIME_COLUMN].min()} to {daily_data[TIME_COLUMN].max()}\")"
|
|
479
|
+
]
|
|
480
|
+
},
|
|
481
|
+
{
|
|
482
|
+
"cell_type": "markdown",
|
|
483
|
+
"id": "4c4b5039",
|
|
484
|
+
"metadata": {
|
|
485
|
+
"papermill": {
|
|
486
|
+
"duration": 0.003749,
|
|
487
|
+
"end_time": "2026-02-02T13:01:11.694447",
|
|
488
|
+
"exception": false,
|
|
489
|
+
"start_time": "2026-02-02T13:01:11.690698",
|
|
490
|
+
"status": "completed"
|
|
491
|
+
},
|
|
492
|
+
"tags": []
|
|
493
|
+
},
|
|
494
|
+
"source": [
|
|
495
|
+
"## 1c.5 Trend Detection\n",
|
|
496
|
+
"\n",
|
|
497
|
+
"**š Understanding Trends:**\n",
|
|
498
|
+
"- **Increasing**: Metric growing over time (e.g., expanding customer base)\n",
|
|
499
|
+
"- **Decreasing**: Metric shrinking (e.g., declining engagement)\n",
|
|
500
|
+
"- **Stationary**: No significant trend (stable business)\n",
|
|
501
|
+
"\n",
|
|
502
|
+
"**Impact on ML:**\n",
|
|
503
|
+
"- Strong trends can cause data leakage if not handled\n",
|
|
504
|
+
"- Consider detrending or adding trend as explicit feature"
|
|
505
|
+
]
|
|
506
|
+
},
|
|
507
|
+
{
|
|
508
|
+
"cell_type": "code",
|
|
509
|
+
"execution_count": null,
|
|
510
|
+
"id": "1ca5d928",
|
|
511
|
+
"metadata": {
|
|
512
|
+
"execution": {
|
|
513
|
+
"iopub.execute_input": "2026-02-02T13:01:11.703538Z",
|
|
514
|
+
"iopub.status.busy": "2026-02-02T13:01:11.703416Z",
|
|
515
|
+
"iopub.status.idle": "2026-02-02T13:01:11.742499Z",
|
|
516
|
+
"shell.execute_reply": "2026-02-02T13:01:11.741897Z"
|
|
517
|
+
},
|
|
518
|
+
"papermill": {
|
|
519
|
+
"duration": 0.04516,
|
|
520
|
+
"end_time": "2026-02-02T13:01:11.744013",
|
|
521
|
+
"exception": false,
|
|
522
|
+
"start_time": "2026-02-02T13:01:11.698853",
|
|
523
|
+
"status": "completed"
|
|
524
|
+
},
|
|
525
|
+
"tags": []
|
|
526
|
+
},
|
|
527
|
+
"outputs": [],
|
|
528
|
+
"source": [
|
|
529
|
+
"# Trend Analysis - computation and visualization\n",
|
|
530
|
+
"from customer_retention.stages.profiling import generate_trend_recommendations\n",
|
|
531
|
+
"\n",
|
|
532
|
+
"analyzer = TemporalPatternAnalyzer(time_column=TIME_COLUMN)\n",
|
|
533
|
+
"trend_result = analyzer.detect_trend(daily_data, value_column=analysis_col)\n",
|
|
534
|
+
"trend_recs = generate_trend_recommendations(trend_result, mean_value=daily_data[analysis_col].mean())\n",
|
|
535
|
+
"\n",
|
|
536
|
+
"# Visualization\n",
|
|
537
|
+
"direction_emoji = {\"increasing\": \"š\", \"decreasing\": \"š\", \"stable\": \"ā”ļø\", \"unknown\": \"ā\"}\n",
|
|
538
|
+
"print(f\"Trend: {direction_emoji.get(trend_result.direction.value, '')} {trend_result.direction.value.upper()} (R²={trend_result.strength:.2f})\")\n",
|
|
539
|
+
"\n",
|
|
540
|
+
"fig = go.Figure()\n",
|
|
541
|
+
"fig.add_trace(go.Scatter(\n",
|
|
542
|
+
" x=daily_data[TIME_COLUMN], y=daily_data[analysis_col],\n",
|
|
543
|
+
" mode=\"lines\", name=\"Daily Values\", line=dict(color=\"steelblue\", width=1), opacity=0.7\n",
|
|
544
|
+
"))\n",
|
|
545
|
+
"\n",
|
|
546
|
+
"if trend_result.slope is not None:\n",
|
|
547
|
+
" x_numeric = (daily_data[TIME_COLUMN] - daily_data[TIME_COLUMN].min()).dt.days\n",
|
|
548
|
+
" y_trend = trend_result.slope * x_numeric + (daily_data[analysis_col].mean() - trend_result.slope * x_numeric.mean())\n",
|
|
549
|
+
" trend_color = {TrendDirection.INCREASING: \"green\", TrendDirection.DECREASING: \"red\"}.get(trend_result.direction, \"gray\")\n",
|
|
550
|
+
" fig.add_trace(go.Scatter(\n",
|
|
551
|
+
" x=daily_data[TIME_COLUMN], y=y_trend, mode=\"lines\",\n",
|
|
552
|
+
" name=f\"Trend ({trend_result.direction.value})\", line=dict(color=trend_color, width=3, dash=\"dash\")\n",
|
|
553
|
+
" ))\n",
|
|
554
|
+
"\n",
|
|
555
|
+
"rolling_avg = daily_data[analysis_col].rolling(window=pattern_config.rolling_window, center=True).mean()\n",
|
|
556
|
+
"fig.add_trace(go.Scatter(\n",
|
|
557
|
+
" x=daily_data[TIME_COLUMN], y=rolling_avg, mode=\"lines\",\n",
|
|
558
|
+
" name=f\"{pattern_config.rolling_window}-day Rolling Avg\", line=dict(color=\"orange\", width=2)\n",
|
|
559
|
+
"))\n",
|
|
560
|
+
"\n",
|
|
561
|
+
"fig.update_layout(\n",
|
|
562
|
+
" title=f\"Trend Analysis: {trend_result.direction.value.title()} (R²={trend_result.strength:.2f})\",\n",
|
|
563
|
+
" xaxis_title=\"Date\", yaxis_title=\"Value\", template=\"plotly_white\", height=400,\n",
|
|
564
|
+
" legend=dict(yanchor=\"top\", y=0.99, xanchor=\"left\", x=0.01)\n",
|
|
565
|
+
")\n",
|
|
566
|
+
"display_figure(fig)"
|
|
567
|
+
]
|
|
568
|
+
},
|
|
569
|
+
{
|
|
570
|
+
"cell_type": "code",
|
|
571
|
+
"execution_count": null,
|
|
572
|
+
"id": "1c5ac864",
|
|
573
|
+
"metadata": {
|
|
574
|
+
"execution": {
|
|
575
|
+
"iopub.execute_input": "2026-02-02T13:01:11.764953Z",
|
|
576
|
+
"iopub.status.busy": "2026-02-02T13:01:11.764827Z",
|
|
577
|
+
"iopub.status.idle": "2026-02-02T13:01:11.768446Z",
|
|
578
|
+
"shell.execute_reply": "2026-02-02T13:01:11.767984Z"
|
|
579
|
+
},
|
|
580
|
+
"papermill": {
|
|
581
|
+
"duration": 0.014974,
|
|
582
|
+
"end_time": "2026-02-02T13:01:11.768924",
|
|
583
|
+
"exception": false,
|
|
584
|
+
"start_time": "2026-02-02T13:01:11.753950",
|
|
585
|
+
"status": "completed"
|
|
586
|
+
},
|
|
587
|
+
"tags": []
|
|
588
|
+
},
|
|
589
|
+
"outputs": [],
|
|
590
|
+
"source": [
|
|
591
|
+
"# Trend details and recommendations\n",
|
|
592
|
+
"print(\"š TREND ANALYSIS DETAILS\")\n",
|
|
593
|
+
"print(\"=\"*50)\n",
|
|
594
|
+
"print(f\"\\n Direction: {trend_result.direction.value.upper()}\")\n",
|
|
595
|
+
"print(f\" Strength (R²): {trend_result.strength:.3f}\")\n",
|
|
596
|
+
"print(f\" Confidence: {trend_result.confidence.upper()}\")\n",
|
|
597
|
+
"\n",
|
|
598
|
+
"if trend_result.slope is not None:\n",
|
|
599
|
+
" mean_val = daily_data[analysis_col].mean()\n",
|
|
600
|
+
" daily_pct = (trend_result.slope / mean_val * 100) if mean_val else 0\n",
|
|
601
|
+
" print(f\" Slope: {trend_result.slope:.4f} per day ({daily_pct:+.3f}%/day)\")\n",
|
|
602
|
+
"if trend_result.p_value is not None:\n",
|
|
603
|
+
" print(f\" P-value: {trend_result.p_value:.4f}\")\n",
|
|
604
|
+
"\n",
|
|
605
|
+
"print(\"\\nš RECOMMENDATIONS:\")\n",
|
|
606
|
+
"for rec in trend_recs:\n",
|
|
607
|
+
" priority_icon = {\"high\": \"š“\", \"medium\": \"š”\", \"low\": \"š¢\"}.get(rec.priority, \"āŖ\")\n",
|
|
608
|
+
" print(f\" {priority_icon} [{rec.priority.upper()}] {rec.action}\")\n",
|
|
609
|
+
" print(f\" {rec.reason}\")\n",
|
|
610
|
+
" if rec.features:\n",
|
|
611
|
+
" print(f\" Features: {', '.join(rec.features)}\")\n",
|
|
612
|
+
"\n",
|
|
613
|
+
"TREND_RECOMMENDATIONS = [{\"action\": r.action, \"priority\": r.priority, \"reason\": r.reason, \n",
|
|
614
|
+
" \"features\": r.features} for r in trend_recs]"
|
|
615
|
+
]
|
|
616
|
+
},
|
|
617
|
+
{
|
|
618
|
+
"cell_type": "markdown",
|
|
619
|
+
"id": "8f32dd60",
|
|
620
|
+
"metadata": {
|
|
621
|
+
"papermill": {
|
|
622
|
+
"duration": 0.009373,
|
|
623
|
+
"end_time": "2026-02-02T13:01:11.788414",
|
|
624
|
+
"exception": false,
|
|
625
|
+
"start_time": "2026-02-02T13:01:11.779041",
|
|
626
|
+
"status": "completed"
|
|
627
|
+
},
|
|
628
|
+
"tags": []
|
|
629
|
+
},
|
|
630
|
+
"source": [
|
|
631
|
+
"## 1c.6 Seasonality Detection\n",
|
|
632
|
+
"\n",
|
|
633
|
+
"**š Understanding Seasonality:**\n",
|
|
634
|
+
"- **Weekly** (period=7): Higher activity on certain days\n",
|
|
635
|
+
"- **Monthly** (period~30): End-of-month patterns, billing cycles\n",
|
|
636
|
+
"- **Quarterly** (period~90): Business cycles, seasonal products\n",
|
|
637
|
+
"\n",
|
|
638
|
+
"**š Interpreting Strength (Autocorrelation):**\n",
|
|
639
|
+
"\n",
|
|
640
|
+
"Strength measures how well values at a given lag correlate with current values.\n",
|
|
641
|
+
"\n",
|
|
642
|
+
"| Strength | Interpretation | Random Data Baseline |\n",
|
|
643
|
+
"|----------|----------------|---------------------|\n",
|
|
644
|
+
"| 0.0 | No pattern (random noise) | ā 0.0 |\n",
|
|
645
|
+
"| 0.1ā0.3 | Weak pattern | Barely above random |\n",
|
|
646
|
+
"| 0.3ā0.5 | Moderate pattern | 3ā5Ć lift over random |\n",
|
|
647
|
+
"| 0.5ā0.7 | Strong pattern | Clear repeating cycle |\n",
|
|
648
|
+
"| > 0.7 | Very strong pattern | Near-deterministic cycle |\n",
|
|
649
|
+
"\n",
|
|
650
|
+
"*Lift interpretation: A strength of 0.4 means the pattern explains ~40% of variance at that lag, vs ~0% for random data.*\n",
|
|
651
|
+
"\n",
|
|
652
|
+
"**šÆ Window-Aligned Pattern Detection:**\n",
|
|
653
|
+
"\n",
|
|
654
|
+
"We check two types of patterns:\n",
|
|
655
|
+
"- **Natural periods** (7, 14, 21, 30 days): Calendar-driven cycles\n",
|
|
656
|
+
"- **Aggregation windows** (from findings): Patterns at your selected feature windows (e.g., 180d, 365d)\n",
|
|
657
|
+
"\n",
|
|
658
|
+
"If a pattern aligns with your aggregation window, features computed over that window may capture the full cycle ā consider this when interpreting aggregated features.\n",
|
|
659
|
+
"\n",
|
|
660
|
+
"**Impact on ML:**\n",
|
|
661
|
+
"- Add day-of-week, month features for detected periods\n",
|
|
662
|
+
"- Consider seasonal decomposition for strong patterns\n",
|
|
663
|
+
"- Use cyclical encodings (sin/cos) for neural networks"
|
|
664
|
+
]
|
|
665
|
+
},
|
|
666
|
+
{
|
|
667
|
+
"cell_type": "code",
|
|
668
|
+
"execution_count": null,
|
|
669
|
+
"id": "6daf881c",
|
|
670
|
+
"metadata": {
|
|
671
|
+
"execution": {
|
|
672
|
+
"iopub.execute_input": "2026-02-02T13:01:11.808863Z",
|
|
673
|
+
"iopub.status.busy": "2026-02-02T13:01:11.808744Z",
|
|
674
|
+
"iopub.status.idle": "2026-02-02T13:01:11.863490Z",
|
|
675
|
+
"shell.execute_reply": "2026-02-02T13:01:11.862982Z"
|
|
676
|
+
},
|
|
677
|
+
"papermill": {
|
|
678
|
+
"duration": 0.06617,
|
|
679
|
+
"end_time": "2026-02-02T13:01:11.864375",
|
|
680
|
+
"exception": false,
|
|
681
|
+
"start_time": "2026-02-02T13:01:11.798205",
|
|
682
|
+
"status": "completed"
|
|
683
|
+
},
|
|
684
|
+
"tags": []
|
|
685
|
+
},
|
|
686
|
+
"outputs": [],
|
|
687
|
+
"source": [
|
|
688
|
+
"# Seasonality Analysis - Temporal Pattern Grid + Autocorrelation\n",
|
|
689
|
+
"from plotly.subplots import make_subplots\n",
|
|
690
|
+
"\n",
|
|
691
|
+
"# Prepare temporal columns\n",
|
|
692
|
+
"daily_data[\"day_of_week\"] = daily_data[TIME_COLUMN].dt.day_name()\n",
|
|
693
|
+
"daily_data[\"month\"] = daily_data[TIME_COLUMN].dt.month_name()\n",
|
|
694
|
+
"daily_data[\"quarter\"] = \"Q\" + daily_data[TIME_COLUMN].dt.quarter.astype(str)\n",
|
|
695
|
+
"daily_data[\"year\"] = daily_data[TIME_COLUMN].dt.year.astype(str)\n",
|
|
696
|
+
"\n",
|
|
697
|
+
"dow_order = [\"Monday\", \"Tuesday\", \"Wednesday\", \"Thursday\", \"Friday\", \"Saturday\", \"Sunday\"]\n",
|
|
698
|
+
"month_order = [\"January\", \"February\", \"March\", \"April\", \"May\", \"June\", \"July\", \"August\", \"September\", \"October\", \"November\", \"December\"]\n",
|
|
699
|
+
"daily_data[\"day_of_week\"] = pd.Categorical(daily_data[\"day_of_week\"], categories=[d for d in dow_order if d in daily_data[\"day_of_week\"].values], ordered=True)\n",
|
|
700
|
+
"daily_data[\"month\"] = pd.Categorical(daily_data[\"month\"], categories=[m for m in month_order if m in daily_data[\"month\"].values], ordered=True)\n",
|
|
701
|
+
"daily_data[\"quarter\"] = pd.Categorical(daily_data[\"quarter\"], categories=[q for q in [\"Q1\",\"Q2\",\"Q3\",\"Q4\"] if q in daily_data[\"quarter\"].values], ordered=True)\n",
|
|
702
|
+
"\n",
|
|
703
|
+
"# Compute statistics\n",
|
|
704
|
+
"dow_stats = daily_data.groupby(\"day_of_week\", observed=True)[analysis_col].agg([\"mean\", \"std\"]).reset_index()\n",
|
|
705
|
+
"monthly_stats = daily_data.groupby(\"month\", observed=True)[analysis_col].agg([\"mean\", \"std\"]).reset_index()\n",
|
|
706
|
+
"quarterly_stats = daily_data.groupby(\"quarter\", observed=True)[analysis_col].agg([\"mean\", \"std\"]).reset_index()\n",
|
|
707
|
+
"yearly_stats = daily_data.groupby(\"year\", observed=True)[analysis_col].agg([\"mean\", \"std\"]).reset_index()\n",
|
|
708
|
+
"overall_mean = daily_data[analysis_col].mean()\n",
|
|
709
|
+
"\n",
|
|
710
|
+
"# Get aggregation window lags for seasonality detection\n",
|
|
711
|
+
"window_lags = []\n",
|
|
712
|
+
"if findings.time_series_metadata and findings.time_series_metadata.suggested_aggregations:\n",
|
|
713
|
+
" for w in findings.time_series_metadata.suggested_aggregations:\n",
|
|
714
|
+
" if w != \"all_time\":\n",
|
|
715
|
+
" days = int(w.replace(\"d\", \"\").replace(\"h\", \"\")) if \"d\" in w else int(w.replace(\"h\", \"\")) // 24\n",
|
|
716
|
+
" if days > 30:\n",
|
|
717
|
+
" window_lags.append(days)\n",
|
|
718
|
+
"\n",
|
|
719
|
+
"# Run seasonality detection\n",
|
|
720
|
+
"seasonality_results = analyzer.detect_seasonality(daily_data, value_column=analysis_col, additional_lags=window_lags)\n",
|
|
721
|
+
"\n",
|
|
722
|
+
"# Create 2x2 visualization grid\n",
|
|
723
|
+
"fig = make_subplots(rows=2, cols=2, subplot_titles=[\"Day of Week\", \"Monthly\", \"Quarterly\", \"Yearly\"],\n",
|
|
724
|
+
" horizontal_spacing=0.1, vertical_spacing=0.12)\n",
|
|
725
|
+
"\n",
|
|
726
|
+
"colors_dow = [\"lightgray\" if d in [\"Saturday\", \"Sunday\"] else \"steelblue\" for d in dow_stats[\"day_of_week\"]]\n",
|
|
727
|
+
"fig.add_trace(go.Bar(x=dow_stats[\"day_of_week\"], y=dow_stats[\"mean\"], error_y=dict(type=\"data\", array=dow_stats[\"std\"]),\n",
|
|
728
|
+
" marker_color=colors_dow, showlegend=False), row=1, col=1)\n",
|
|
729
|
+
"fig.add_trace(go.Bar(x=monthly_stats[\"month\"], y=monthly_stats[\"mean\"], error_y=dict(type=\"data\", array=monthly_stats[\"std\"]),\n",
|
|
730
|
+
" marker_color=\"mediumpurple\", showlegend=False), row=1, col=2)\n",
|
|
731
|
+
"fig.add_trace(go.Bar(x=quarterly_stats[\"quarter\"], y=quarterly_stats[\"mean\"], error_y=dict(type=\"data\", array=quarterly_stats[\"std\"]),\n",
|
|
732
|
+
" marker_color=\"teal\", showlegend=False), row=2, col=1)\n",
|
|
733
|
+
"fig.add_trace(go.Bar(x=yearly_stats[\"year\"], y=yearly_stats[\"mean\"], error_y=dict(type=\"data\", array=yearly_stats[\"std\"]),\n",
|
|
734
|
+
" marker_color=\"coral\", showlegend=False), row=2, col=2)\n",
|
|
735
|
+
"\n",
|
|
736
|
+
"for row, col in [(1, 1), (1, 2), (2, 1), (2, 2)]:\n",
|
|
737
|
+
" fig.add_hline(y=overall_mean, line_dash=\"dot\", line_color=\"red\", opacity=0.5, row=row, col=col)\n",
|
|
738
|
+
"\n",
|
|
739
|
+
"fig.update_layout(title={\"text\": \"š
Temporal Pattern Analysis<br><sup>Gray = weekends | Red line = overall mean</sup>\",\n",
|
|
740
|
+
" \"x\": 0.5, \"xanchor\": \"center\"}, template=\"plotly_white\", height=700)\n",
|
|
741
|
+
"fig.update_yaxes(title_text=\"Avg Value\", row=1, col=1)\n",
|
|
742
|
+
"fig.update_yaxes(title_text=\"Avg Value\", row=2, col=1)\n",
|
|
743
|
+
"display_figure(fig)\n",
|
|
744
|
+
"\n",
|
|
745
|
+
"# Combined Pattern Analysis\n",
|
|
746
|
+
"print(\"š SEASONALITY & TEMPORAL PATTERN ANALYSIS\")\n",
|
|
747
|
+
"print(\"=\"*60)\n",
|
|
748
|
+
"\n",
|
|
749
|
+
"# Variation analysis\n",
|
|
750
|
+
"def calc_var(stats): return (stats[\"mean\"].max() - stats[\"mean\"].min()) / overall_mean * 100 if len(stats) > 1 else 0\n",
|
|
751
|
+
"variations = {\"day_of_week\": calc_var(dow_stats), \"month\": calc_var(monthly_stats), \n",
|
|
752
|
+
" \"quarter\": calc_var(quarterly_stats), \"year\": calc_var(yearly_stats)}\n",
|
|
753
|
+
"\n",
|
|
754
|
+
"print(f\"\\nš Pattern Variation (% from mean):\")\n",
|
|
755
|
+
"print(f\" Day of Week: {variations['day_of_week']:.1f}%\")\n",
|
|
756
|
+
"print(f\" Monthly: {variations['month']:.1f}%\")\n",
|
|
757
|
+
"print(f\" Quarterly: {variations['quarter']:.1f}%\")\n",
|
|
758
|
+
"print(f\" Yearly: {variations['year']:.1f}%\")\n",
|
|
759
|
+
"\n",
|
|
760
|
+
"# Autocorrelation seasonality\n",
|
|
761
|
+
"print(f\"\\nš Autocorrelation Seasonality (threshold > 0.3):\")\n",
|
|
762
|
+
"if seasonality_results:\n",
|
|
763
|
+
" for sr in seasonality_results:\n",
|
|
764
|
+
" strength = \"Strong\" if sr.strength > 0.5 else \"Moderate\"\n",
|
|
765
|
+
" aligned = \" [aggregation window]\" if sr.period in window_lags else \"\"\n",
|
|
766
|
+
" print(f\" ⢠{sr.period_name or f'{sr.period}d'}: {sr.strength:.3f} ({strength}){aligned}\")\n",
|
|
767
|
+
"else:\n",
|
|
768
|
+
" print(\" No significant autocorrelation patterns detected\")\n",
|
|
769
|
+
"\n",
|
|
770
|
+
"# Generate recommendations\n",
|
|
771
|
+
"SEASONALITY_RECOMMENDATIONS = []\n",
|
|
772
|
+
"for pattern, var_pct in variations.items():\n",
|
|
773
|
+
" priority = \"high\" if var_pct > 20 else \"medium\" if var_pct > 10 else \"low\"\n",
|
|
774
|
+
" \n",
|
|
775
|
+
" if pattern == \"day_of_week\" and var_pct > 10:\n",
|
|
776
|
+
" SEASONALITY_RECOMMENDATIONS.append({\"pattern\": pattern, \"variation\": var_pct, \"priority\": priority,\n",
|
|
777
|
+
" \"features\": [\"dow_sin\", \"dow_cos\", \"is_weekend\"], \"reason\": f\"{var_pct:.1f}% variation - add cyclical encoding\"})\n",
|
|
778
|
+
" elif pattern == \"month\" and var_pct > 10:\n",
|
|
779
|
+
" SEASONALITY_RECOMMENDATIONS.append({\"pattern\": pattern, \"variation\": var_pct, \"priority\": priority,\n",
|
|
780
|
+
" \"features\": [\"month_sin\", \"month_cos\"], \"reason\": f\"{var_pct:.1f}% variation - add cyclical encoding\"})\n",
|
|
781
|
+
" elif pattern == \"quarter\" and var_pct > 10:\n",
|
|
782
|
+
" SEASONALITY_RECOMMENDATIONS.append({\"pattern\": pattern, \"variation\": var_pct, \"priority\": priority,\n",
|
|
783
|
+
" \"features\": [\"quarter_sin\", \"quarter_cos\"], \"reason\": f\"{var_pct:.1f}% variation - add cyclical encoding\"})\n",
|
|
784
|
+
" elif pattern == \"year\" and var_pct > 20:\n",
|
|
785
|
+
" trend_explains = 'trend_result' in dir() and trend_result.strength > 0.3 and trend_result.has_direction\n",
|
|
786
|
+
" if trend_explains:\n",
|
|
787
|
+
" SEASONALITY_RECOMMENDATIONS.append({\"pattern\": pattern, \"variation\": var_pct, \"priority\": priority,\n",
|
|
788
|
+
" \"features\": [\"year_trend\"], \"reason\": f\"{var_pct:.1f}% variation aligned with trend\"})\n",
|
|
789
|
+
" else:\n",
|
|
790
|
+
" SEASONALITY_RECOMMENDATIONS.append({\"pattern\": pattern, \"variation\": var_pct, \"priority\": priority,\n",
|
|
791
|
+
" \"features\": [\"year_categorical\"], \"reason\": f\"{var_pct:.1f}% variation but NO linear trend - use categorical\",\n",
|
|
792
|
+
" \"warning\": \"Stepwise changes or non-linear cycles suspected\"})\n",
|
|
793
|
+
"\n",
|
|
794
|
+
"# For autocorrelation-detected patterns\n",
|
|
795
|
+
"for sr in seasonality_results:\n",
|
|
796
|
+
" if sr.period in [7, 14, 21, 30] and sr.strength > 0.3:\n",
|
|
797
|
+
" SEASONALITY_RECOMMENDATIONS.append({\"pattern\": f\"{sr.period}d_cycle\", \"variation\": sr.strength * 100, \n",
|
|
798
|
+
" \"priority\": \"medium\", \"features\": [f\"lag_{sr.period}d_ratio\"], \n",
|
|
799
|
+
" \"reason\": f\"Autocorrelation {sr.strength:.2f} at {sr.period}d - add lag ratio feature\"})\n",
|
|
800
|
+
"\n",
|
|
801
|
+
"print(\"\\n\" + \"ā\"*60)\n",
|
|
802
|
+
"print(\"š SEASONALITY RECOMMENDATIONS:\")\n",
|
|
803
|
+
"print(\"ā\"*60)\n",
|
|
804
|
+
"if SEASONALITY_RECOMMENDATIONS:\n",
|
|
805
|
+
" for rec in SEASONALITY_RECOMMENDATIONS:\n",
|
|
806
|
+
" icon = {\"high\": \"š“\", \"medium\": \"š”\", \"low\": \"š¢\"}.get(rec[\"priority\"], \"āŖ\")\n",
|
|
807
|
+
" print(f\"\\n{icon} [{rec['priority'].upper()}] {rec['pattern'].replace('_', ' ').title()}\")\n",
|
|
808
|
+
" print(f\" {rec['reason']}\")\n",
|
|
809
|
+
" if rec.get(\"warning\"):\n",
|
|
810
|
+
" print(f\" ā ļø {rec['warning']}\")\n",
|
|
811
|
+
" if rec.get(\"features\"):\n",
|
|
812
|
+
" print(f\" ā Features: {', '.join(rec['features'])}\")\n",
|
|
813
|
+
"else:\n",
|
|
814
|
+
" print(\"\\n No significant patterns - seasonal features unlikely to help\")\n",
|
|
815
|
+
"\n",
|
|
816
|
+
"TEMPORAL_PATTERN_RECOMMENDATIONS = SEASONALITY_RECOMMENDATIONS"
|
|
817
|
+
]
|
|
818
|
+
},
|
|
819
|
+
{
|
|
820
|
+
"cell_type": "markdown",
|
|
821
|
+
"id": "beb4cf26",
|
|
822
|
+
"metadata": {
|
|
823
|
+
"papermill": {
|
|
824
|
+
"duration": 0.012246,
|
|
825
|
+
"end_time": "2026-02-02T13:01:11.888736",
|
|
826
|
+
"exception": false,
|
|
827
|
+
"start_time": "2026-02-02T13:01:11.876490",
|
|
828
|
+
"status": "completed"
|
|
829
|
+
},
|
|
830
|
+
"tags": []
|
|
831
|
+
},
|
|
832
|
+
"source": [
|
|
833
|
+
"## 1c.7 Cohort Analysis\n",
|
|
834
|
+
"\n",
|
|
835
|
+
"**š Understanding Cohorts:**\n",
|
|
836
|
+
"- Group entities by when they first appeared (signup cohort)\n",
|
|
837
|
+
"- Compare behavior across cohorts\n",
|
|
838
|
+
"- Identify if acquisition quality changed over time\n",
|
|
839
|
+
"\n",
|
|
840
|
+
"**Cohorts vs Segments:** Cohorts are time-bound groups (when entities joined), while segments are attribute-based groups (what entities are). Cohorts are fixed at signup; segments can change over time.\n",
|
|
841
|
+
"\n",
|
|
842
|
+
"**Other time-based cohort ideas:**\n",
|
|
843
|
+
"- First purchase date (not just signup)\n",
|
|
844
|
+
"- First feature usage (e.g., \"first mobile app use\")\n",
|
|
845
|
+
"- Campaign/promotion exposure date\n",
|
|
846
|
+
"- Onboarding completion date\n",
|
|
847
|
+
"- Product version or pricing plan at signup time\n",
|
|
848
|
+
"\n",
|
|
849
|
+
"These can be derived as custom features if your data contains the relevant timestamps."
|
|
850
|
+
]
|
|
851
|
+
},
|
|
852
|
+
{
|
|
853
|
+
"cell_type": "code",
|
|
854
|
+
"execution_count": null,
|
|
855
|
+
"id": "97db3bdb",
|
|
856
|
+
"metadata": {
|
|
857
|
+
"execution": {
|
|
858
|
+
"iopub.execute_input": "2026-02-02T13:01:11.914469Z",
|
|
859
|
+
"iopub.status.busy": "2026-02-02T13:01:11.914351Z",
|
|
860
|
+
"iopub.status.idle": "2026-02-02T13:01:11.976413Z",
|
|
861
|
+
"shell.execute_reply": "2026-02-02T13:01:11.975921Z"
|
|
862
|
+
},
|
|
863
|
+
"papermill": {
|
|
864
|
+
"duration": 0.076105,
|
|
865
|
+
"end_time": "2026-02-02T13:01:11.977184",
|
|
866
|
+
"exception": false,
|
|
867
|
+
"start_time": "2026-02-02T13:01:11.901079",
|
|
868
|
+
"status": "completed"
|
|
869
|
+
},
|
|
870
|
+
"tags": []
|
|
871
|
+
},
|
|
872
|
+
"outputs": [],
|
|
873
|
+
"source": [
|
|
874
|
+
"# Cohort Analysis - computation and visualization\n",
|
|
875
|
+
"from customer_retention.stages.profiling import analyze_cohort_distribution, generate_cohort_recommendations\n",
|
|
876
|
+
"\n",
|
|
877
|
+
"COHORT_RECOMMENDATIONS = []\n",
|
|
878
|
+
"cohort_dist = None\n",
|
|
879
|
+
"\n",
|
|
880
|
+
"if ENTITY_COLUMN:\n",
|
|
881
|
+
" first_events = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].min().reset_index()\n",
|
|
882
|
+
" first_events.columns = [ENTITY_COLUMN, \"first_event\"]\n",
|
|
883
|
+
" cohort_dist = analyze_cohort_distribution(first_events, \"first_event\")\n",
|
|
884
|
+
" \n",
|
|
885
|
+
" cohort_result = analyzer.analyze_cohorts(\n",
|
|
886
|
+
" df, entity_column=ENTITY_COLUMN, cohort_column=TIME_COLUMN,\n",
|
|
887
|
+
" target_column=TARGET_COLUMN, period=\"M\"\n",
|
|
888
|
+
" )\n",
|
|
889
|
+
" \n",
|
|
890
|
+
" print(\"š„ COHORT ANALYSIS\")\n",
|
|
891
|
+
" print(\"=\"*50)\n",
|
|
892
|
+
" print(f\"\\nEntity Onboarding: {cohort_dist.dominant_pct:.0f}% in {cohort_dist.dominant_year}, {cohort_dist.num_years} years total\")\n",
|
|
893
|
+
" \n",
|
|
894
|
+
" if len(cohort_result) > 0:\n",
|
|
895
|
+
" cohort_sorted = cohort_result.sort_values(\"cohort\")\n",
|
|
896
|
+
" has_retention = \"retention_rate\" in cohort_sorted.columns\n",
|
|
897
|
+
" \n",
|
|
898
|
+
" fig = make_subplots(specs=[[{\"secondary_y\": True}]]) if has_retention else go.Figure()\n",
|
|
899
|
+
" \n",
|
|
900
|
+
" fig.add_trace(go.Bar(\n",
|
|
901
|
+
" x=cohort_sorted[\"cohort\"].astype(str), y=cohort_sorted[\"entity_count\"],\n",
|
|
902
|
+
" name=\"Entities (sign-up cohort)\", marker_color=\"steelblue\", opacity=0.7\n",
|
|
903
|
+
" ), secondary_y=False) if has_retention else fig.add_trace(go.Bar(\n",
|
|
904
|
+
" x=cohort_sorted[\"cohort\"].astype(str), y=cohort_sorted[\"entity_count\"],\n",
|
|
905
|
+
" name=\"Entities (sign-up cohort)\", marker_color=\"steelblue\", opacity=0.7\n",
|
|
906
|
+
" ))\n",
|
|
907
|
+
" \n",
|
|
908
|
+
" if has_retention:\n",
|
|
909
|
+
" fig.add_trace(go.Scatter(\n",
|
|
910
|
+
" x=cohort_sorted[\"cohort\"].astype(str), y=cohort_sorted[\"retention_rate\"] * 100,\n",
|
|
911
|
+
" mode=\"lines+markers\", name=\"Retention Rate %\",\n",
|
|
912
|
+
" line=dict(color=\"coral\", width=3), marker=dict(size=8)\n",
|
|
913
|
+
" ), secondary_y=True)\n",
|
|
914
|
+
" fig.update_yaxes(title_text=\"Retention Rate %\", secondary_y=True)\n",
|
|
915
|
+
" \n",
|
|
916
|
+
" fig.update_layout(\n",
|
|
917
|
+
" title=\"Cohort Analysis: Entity Count by Sign-up Month (cohort = first event period)\",\n",
|
|
918
|
+
" xaxis_title=\"Cohort (First Event Month)\", template=\"plotly_white\", height=400\n",
|
|
919
|
+
" )\n",
|
|
920
|
+
" fig.update_yaxes(title_text=\"Entity Count\", secondary_y=False) if has_retention else fig.update_yaxes(title_text=\"Entity Count\")\n",
|
|
921
|
+
" display_figure(fig)"
|
|
922
|
+
]
|
|
923
|
+
},
|
|
924
|
+
{
|
|
925
|
+
"cell_type": "code",
|
|
926
|
+
"execution_count": null,
|
|
927
|
+
"id": "95eeb3e7",
|
|
928
|
+
"metadata": {
|
|
929
|
+
"execution": {
|
|
930
|
+
"iopub.execute_input": "2026-02-02T13:01:12.006223Z",
|
|
931
|
+
"iopub.status.busy": "2026-02-02T13:01:12.006075Z",
|
|
932
|
+
"iopub.status.idle": "2026-02-02T13:01:12.013450Z",
|
|
933
|
+
"shell.execute_reply": "2026-02-02T13:01:12.012005Z"
|
|
934
|
+
},
|
|
935
|
+
"papermill": {
|
|
936
|
+
"duration": 0.022947,
|
|
937
|
+
"end_time": "2026-02-02T13:01:12.014216",
|
|
938
|
+
"exception": false,
|
|
939
|
+
"start_time": "2026-02-02T13:01:11.991269",
|
|
940
|
+
"status": "completed"
|
|
941
|
+
},
|
|
942
|
+
"tags": []
|
|
943
|
+
},
|
|
944
|
+
"outputs": [],
|
|
945
|
+
"source": [
|
|
946
|
+
"# Cohort details and recommendations\n",
|
|
947
|
+
"if ENTITY_COLUMN and cohort_dist:\n",
|
|
948
|
+
" retention_var = None\n",
|
|
949
|
+
" if \"retention_rate\" in cohort_result.columns:\n",
|
|
950
|
+
" retention_var = cohort_result[\"retention_rate\"].max() - cohort_result[\"retention_rate\"].min()\n",
|
|
951
|
+
" \n",
|
|
952
|
+
" cohort_recs = generate_cohort_recommendations(cohort_dist, retention_variation=retention_var)\n",
|
|
953
|
+
" \n",
|
|
954
|
+
" print(\"š COHORT DETAILS\")\n",
|
|
955
|
+
" print(\"=\"*50)\n",
|
|
956
|
+
" print(f\"\\nEntity Onboarding Distribution by Year:\")\n",
|
|
957
|
+
" print(\"ā\" * 40)\n",
|
|
958
|
+
" for year, count in sorted(cohort_dist.year_counts.items()):\n",
|
|
959
|
+
" pct = count / cohort_dist.total_entities * 100\n",
|
|
960
|
+
" bar = \"ā\" * int(pct / 3)\n",
|
|
961
|
+
" print(f\" {year}: {count:>5,} entities ({pct:>5.1f}%) {bar}\")\n",
|
|
962
|
+
" \n",
|
|
963
|
+
" print(f\"\\n Total entities: {cohort_dist.total_entities:,}\")\n",
|
|
964
|
+
" print(f\" Data spans: {df[TIME_COLUMN].min().date()} to {df[TIME_COLUMN].max().date()}\")\n",
|
|
965
|
+
" \n",
|
|
966
|
+
" print(\"\\nš RECOMMENDATIONS:\")\n",
|
|
967
|
+
" for rec in cohort_recs:\n",
|
|
968
|
+
" priority_icon = {\"high\": \"š“\", \"medium\": \"š”\", \"low\": \"š¢\"}.get(rec.priority, \"āŖ\")\n",
|
|
969
|
+
" print(f\" {priority_icon} [{rec.priority.upper()}] {rec.action}\")\n",
|
|
970
|
+
" print(f\" {rec.reason}\")\n",
|
|
971
|
+
" if rec.features:\n",
|
|
972
|
+
" print(f\" Features: {', '.join(rec.features)}\")\n",
|
|
973
|
+
" if rec.insight:\n",
|
|
974
|
+
" print(f\" š” {rec.insight}\")\n",
|
|
975
|
+
" \n",
|
|
976
|
+
" COHORT_RECOMMENDATIONS = [{\"action\": r.action, \"priority\": r.priority, \"reason\": r.reason,\n",
|
|
977
|
+
" \"features\": getattr(r, 'features', []), \n",
|
|
978
|
+
" \"insight\": getattr(r, 'insight', None)} for r in cohort_recs]"
|
|
979
|
+
]
|
|
980
|
+
},
|
|
981
|
+
{
|
|
982
|
+
"cell_type": "markdown",
|
|
983
|
+
"id": "0f0f0294",
|
|
984
|
+
"metadata": {
|
|
985
|
+
"papermill": {
|
|
986
|
+
"duration": 0.037269,
|
|
987
|
+
"end_time": "2026-02-02T13:01:12.065822",
|
|
988
|
+
"exception": false,
|
|
989
|
+
"start_time": "2026-02-02T13:01:12.028553",
|
|
990
|
+
"status": "completed"
|
|
991
|
+
},
|
|
992
|
+
"tags": []
|
|
993
|
+
},
|
|
994
|
+
"source": [
|
|
995
|
+
"## 1c.8 Correlation Matrix Analysis\n",
|
|
996
|
+
"\n",
|
|
997
|
+
"**š Understanding Feature Relationships:**\n",
|
|
998
|
+
"\n",
|
|
999
|
+
"This section shows feature-feature relationships in two complementary ways:\n",
|
|
1000
|
+
"- **Correlation Matrix**: Numerical summary (r values)\n",
|
|
1001
|
+
"- **Scatter Matrix**: Visual relationships with cohort overlay\n",
|
|
1002
|
+
"\n",
|
|
1003
|
+
"| Correlation | Interpretation | Action |\n",
|
|
1004
|
+
"|-------------|----------------|--------|\n",
|
|
1005
|
+
"| `\\|r\\|` > 0.9 | Near-duplicate features | Remove one |\n",
|
|
1006
|
+
"| `\\|r\\|` > 0.7 | Strong relationship | Consider combining |\n",
|
|
1007
|
+
"| `\\|r\\|` < 0.3 | Weak/no relationship | Independent features |"
|
|
1008
|
+
]
|
|
1009
|
+
},
|
|
1010
|
+
{
|
|
1011
|
+
"cell_type": "code",
|
|
1012
|
+
"execution_count": null,
|
|
1013
|
+
"id": "65121cc6",
|
|
1014
|
+
"metadata": {
|
|
1015
|
+
"execution": {
|
|
1016
|
+
"iopub.execute_input": "2026-02-02T13:01:12.100581Z",
|
|
1017
|
+
"iopub.status.busy": "2026-02-02T13:01:12.100463Z",
|
|
1018
|
+
"iopub.status.idle": "2026-02-02T13:01:12.118903Z",
|
|
1019
|
+
"shell.execute_reply": "2026-02-02T13:01:12.118553Z"
|
|
1020
|
+
},
|
|
1021
|
+
"papermill": {
|
|
1022
|
+
"duration": 0.034505,
|
|
1023
|
+
"end_time": "2026-02-02T13:01:12.119780",
|
|
1024
|
+
"exception": false,
|
|
1025
|
+
"start_time": "2026-02-02T13:01:12.085275",
|
|
1026
|
+
"status": "completed"
|
|
1027
|
+
},
|
|
1028
|
+
"tags": []
|
|
1029
|
+
},
|
|
1030
|
+
"outputs": [],
|
|
1031
|
+
"source": [
|
|
1032
|
+
"# Correlation matrix for numeric event attributes\n",
|
|
1033
|
+
"# Define analysis columns - exclude entity, time, target, and temporal metadata\n",
|
|
1034
|
+
"numeric_event_cols = [c for c in df.select_dtypes(include=[np.number]).columns \n",
|
|
1035
|
+
" if c not in [ENTITY_COLUMN, TIME_COLUMN, TARGET_COLUMN]\n",
|
|
1036
|
+
" and c not in TEMPORAL_METADATA_COLS\n",
|
|
1037
|
+
" and 'target' not in c.lower()]\n",
|
|
1038
|
+
"\n",
|
|
1039
|
+
"excluded_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c not in numeric_event_cols]\n",
|
|
1040
|
+
"\n",
|
|
1041
|
+
"print(f\"Correlation Analysis (event-level, n={len(df):,})\")\n",
|
|
1042
|
+
"print(f\" Included ({len(numeric_event_cols)}): {numeric_event_cols}\")\n",
|
|
1043
|
+
"print(f\" Excluded ({len(excluded_cols)}): {excluded_cols}\")\n",
|
|
1044
|
+
"\n",
|
|
1045
|
+
"if len(numeric_event_cols) >= 2:\n",
|
|
1046
|
+
" corr_matrix = df[numeric_event_cols].corr()\n",
|
|
1047
|
+
" fig = charts.heatmap(\n",
|
|
1048
|
+
" corr_matrix.values, x_labels=numeric_event_cols, y_labels=numeric_event_cols,\n",
|
|
1049
|
+
" title=\"Feature Correlation Matrix (Event-Level)\"\n",
|
|
1050
|
+
" )\n",
|
|
1051
|
+
" display_figure(fig)\n",
|
|
1052
|
+
" \n",
|
|
1053
|
+
" # High correlation pairs\n",
|
|
1054
|
+
" high_corr = []\n",
|
|
1055
|
+
" for i in range(len(numeric_event_cols)):\n",
|
|
1056
|
+
" for j in range(i+1, len(numeric_event_cols)):\n",
|
|
1057
|
+
" corr_val = corr_matrix.iloc[i, j]\n",
|
|
1058
|
+
" if abs(corr_val) > 0.7:\n",
|
|
1059
|
+
" high_corr.append((numeric_event_cols[i], numeric_event_cols[j], corr_val))\n",
|
|
1060
|
+
" \n",
|
|
1061
|
+
" if high_corr:\n",
|
|
1062
|
+
" print(\"\\nā ļø Highly correlated pairs (|r| > 0.7):\")\n",
|
|
1063
|
+
" for c1, c2, r in sorted(high_corr, key=lambda x: abs(x[2]), reverse=True)[:5]:\n",
|
|
1064
|
+
" print(f\" {c1} ā {c2}: r={r:.2f}\")\n"
|
|
1065
|
+
]
|
|
1066
|
+
},
|
|
1067
|
+
{
|
|
1068
|
+
"cell_type": "code",
|
|
1069
|
+
"execution_count": null,
|
|
1070
|
+
"id": "1b884449",
|
|
1071
|
+
"metadata": {
|
|
1072
|
+
"execution": {
|
|
1073
|
+
"iopub.execute_input": "2026-02-02T13:01:12.164433Z",
|
|
1074
|
+
"iopub.status.busy": "2026-02-02T13:01:12.164116Z",
|
|
1075
|
+
"iopub.status.idle": "2026-02-02T13:01:12.287514Z",
|
|
1076
|
+
"shell.execute_reply": "2026-02-02T13:01:12.287030Z"
|
|
1077
|
+
},
|
|
1078
|
+
"papermill": {
|
|
1079
|
+
"duration": 0.153277,
|
|
1080
|
+
"end_time": "2026-02-02T13:01:12.288406",
|
|
1081
|
+
"exception": false,
|
|
1082
|
+
"start_time": "2026-02-02T13:01:12.135129",
|
|
1083
|
+
"status": "completed"
|
|
1084
|
+
},
|
|
1085
|
+
"tags": []
|
|
1086
|
+
},
|
|
1087
|
+
"outputs": [],
|
|
1088
|
+
"source": [
|
|
1089
|
+
"# Scatter Matrix: Entity-level features (mixed aggregation types)\n",
|
|
1090
|
+
"if len(numeric_event_cols) >= 2 and ENTITY_COLUMN and TARGET_COLUMN and TARGET_COLUMN in df.columns:\n",
|
|
1091
|
+
" # Create entity-level aggregations (mean, sum, std) - like original\n",
|
|
1092
|
+
" agg_dict = {col: ['mean', 'sum', 'std'] for col in numeric_event_cols}\n",
|
|
1093
|
+
" entity_aggs = df.groupby(ENTITY_COLUMN).agg(agg_dict)\n",
|
|
1094
|
+
" entity_aggs.columns = ['_'.join(col).strip() for col in entity_aggs.columns]\n",
|
|
1095
|
+
" entity_aggs = entity_aggs.reset_index()\n",
|
|
1096
|
+
" \n",
|
|
1097
|
+
" # Get all numeric aggregated columns\n",
|
|
1098
|
+
" all_agg_cols = [c for c in entity_aggs.columns if c != ENTITY_COLUMN]\n",
|
|
1099
|
+
" \n",
|
|
1100
|
+
" # Select top 4 by variance across ALL aggregation types\n",
|
|
1101
|
+
" variances = entity_aggs[all_agg_cols].var().sort_values(ascending=False)\n",
|
|
1102
|
+
" top_features = variances.head(4).index.tolist()\n",
|
|
1103
|
+
" \n",
|
|
1104
|
+
" # Sample if needed\n",
|
|
1105
|
+
" sample_size = min(1000, len(entity_aggs))\n",
|
|
1106
|
+
" scatter_sample = entity_aggs.sample(sample_size, random_state=42) if sample_size < len(entity_aggs) else entity_aggs\n",
|
|
1107
|
+
" \n",
|
|
1108
|
+
" print(f\"Scatter Matrix (n={len(scatter_sample):,} entities)\")\n",
|
|
1109
|
+
" print(f\" Total aggregated features: {len(all_agg_cols)}\")\n",
|
|
1110
|
+
" print(f\" Selected (top 4 by variance): {top_features}\")\n",
|
|
1111
|
+
" \n",
|
|
1112
|
+
" # Short labels for x-axis (no line breaks)\n",
|
|
1113
|
+
" short_labels = [f.replace('_', ' ') for f in top_features]\n",
|
|
1114
|
+
" \n",
|
|
1115
|
+
" scatter_data = scatter_sample[top_features].copy()\n",
|
|
1116
|
+
" scatter_data.columns = short_labels\n",
|
|
1117
|
+
" \n",
|
|
1118
|
+
" fig = charts.scatter_matrix(scatter_data, height=500)\n",
|
|
1119
|
+
" fig.update_traces(marker=dict(opacity=0.5, size=4))\n",
|
|
1120
|
+
" \n",
|
|
1121
|
+
" # Update y-axis labels to be multirow, keep x-axis single row\n",
|
|
1122
|
+
" n_features = len(short_labels)\n",
|
|
1123
|
+
" for i in range(n_features):\n",
|
|
1124
|
+
" # Y-axis: multirow\n",
|
|
1125
|
+
" yaxis_name = f'yaxis{i+1}' if i > 0 else 'yaxis'\n",
|
|
1126
|
+
" y_label = top_features[i].replace('_', '<br>')\n",
|
|
1127
|
+
" fig.update_layout(**{yaxis_name: dict(title=dict(text=y_label))})\n",
|
|
1128
|
+
" \n",
|
|
1129
|
+
" # X-axis: single row (spaces instead of underscores)\n",
|
|
1130
|
+
" xaxis_name = f'xaxis{i+1}' if i > 0 else 'xaxis'\n",
|
|
1131
|
+
" x_label = top_features[i].replace('_', ' ')\n",
|
|
1132
|
+
" fig.update_layout(**{xaxis_name: dict(title=dict(text=x_label))})\n",
|
|
1133
|
+
" \n",
|
|
1134
|
+
" fig.update_layout(\n",
|
|
1135
|
+
" title=\"Feature Relationships (Top 4 by Variance)\",\n",
|
|
1136
|
+
" margin=dict(l=100, r=20, t=50, b=60)\n",
|
|
1137
|
+
" )\n",
|
|
1138
|
+
" \n",
|
|
1139
|
+
" display_figure(fig)\n",
|
|
1140
|
+
" \n",
|
|
1141
|
+
" print(\"\\nš Scatter Matrix Insights:\")\n",
|
|
1142
|
+
" print(\" ⢠Different aggregation types create different patterns/bands\")\n",
|
|
1143
|
+
" print(\" ⢠sum features often show exponential-like distributions\")\n",
|
|
1144
|
+
" print(\" ⢠std features reveal variability clusters\")\n",
|
|
1145
|
+
" print(\" ⢠mean features show central tendency patterns\")\n"
|
|
1146
|
+
]
|
|
1147
|
+
},
|
|
1148
|
+
{
|
|
1149
|
+
"cell_type": "code",
|
|
1150
|
+
"execution_count": null,
|
|
1151
|
+
"id": "7e6d123b",
|
|
1152
|
+
"metadata": {
|
|
1153
|
+
"execution": {
|
|
1154
|
+
"iopub.execute_input": "2026-02-02T13:01:12.323675Z",
|
|
1155
|
+
"iopub.status.busy": "2026-02-02T13:01:12.323562Z",
|
|
1156
|
+
"iopub.status.idle": "2026-02-02T13:01:12.326223Z",
|
|
1157
|
+
"shell.execute_reply": "2026-02-02T13:01:12.325830Z"
|
|
1158
|
+
},
|
|
1159
|
+
"papermill": {
|
|
1160
|
+
"duration": 0.021938,
|
|
1161
|
+
"end_time": "2026-02-02T13:01:12.326774",
|
|
1162
|
+
"exception": false,
|
|
1163
|
+
"start_time": "2026-02-02T13:01:12.304836",
|
|
1164
|
+
"status": "completed"
|
|
1165
|
+
},
|
|
1166
|
+
"tags": []
|
|
1167
|
+
},
|
|
1168
|
+
"outputs": [],
|
|
1169
|
+
"source": [
|
|
1170
|
+
"# Correlation Analysis: Interpretation\n",
|
|
1171
|
+
"print(\"\\n\" + \"=\"*70)\n",
|
|
1172
|
+
"print(\"CORRELATION ANALYSIS SUMMARY\")\n",
|
|
1173
|
+
"print(\"=\"*70)\n",
|
|
1174
|
+
"\n",
|
|
1175
|
+
"if 'high_corr' in dir() and high_corr:\n",
|
|
1176
|
+
" print(f\"\\nš Found {len(high_corr)} highly correlated pairs (|r| > 0.7):\")\n",
|
|
1177
|
+
" for c1, c2, r in sorted(high_corr, key=lambda x: abs(x[2]), reverse=True)[:5]:\n",
|
|
1178
|
+
" print(f\" ⢠{c1} ā {c2}: r={r:.2f}\")\n",
|
|
1179
|
+
" print(\"\\nš” RECOMMENDATIONS:\")\n",
|
|
1180
|
+
" print(\" ā Remove redundant features to reduce multicollinearity\")\n",
|
|
1181
|
+
" print(\" ā Or create composite features from correlated groups\")\n",
|
|
1182
|
+
"else:\n",
|
|
1183
|
+
" print(\"\\nā
No highly correlated pairs detected\")\n",
|
|
1184
|
+
" print(\" ā Features appear independent, good for modeling\")"
|
|
1185
|
+
]
|
|
1186
|
+
},
|
|
1187
|
+
{
|
|
1188
|
+
"cell_type": "markdown",
|
|
1189
|
+
"id": "57364526",
|
|
1190
|
+
"metadata": {
|
|
1191
|
+
"papermill": {
|
|
1192
|
+
"duration": 0.017321,
|
|
1193
|
+
"end_time": "2026-02-02T13:01:12.360755",
|
|
1194
|
+
"exception": false,
|
|
1195
|
+
"start_time": "2026-02-02T13:01:12.343434",
|
|
1196
|
+
"status": "completed"
|
|
1197
|
+
},
|
|
1198
|
+
"tags": []
|
|
1199
|
+
},
|
|
1200
|
+
"source": [
|
|
1201
|
+
"## 1c.9 Temporal Sparklines\n",
|
|
1202
|
+
"\n",
|
|
1203
|
+
"**š Understanding Temporal Trends:**\n",
|
|
1204
|
+
"\n",
|
|
1205
|
+
"Sparklines show how numeric features evolve over time:\n",
|
|
1206
|
+
"\n",
|
|
1207
|
+
"| Pattern | What It Means | Implication |\n",
|
|
1208
|
+
"|---------|--------------|-------------|\n",
|
|
1209
|
+
"| Upward trend | Metric increasing | Growth or engagement |\n",
|
|
1210
|
+
"| Downward trend | Metric decreasing | Decline or churn signal |\n",
|
|
1211
|
+
"| Flat line | Stable metric | Consistent behavior |\n",
|
|
1212
|
+
"| Spikes | Sudden changes | Events or anomalies |"
|
|
1213
|
+
]
|
|
1214
|
+
},
|
|
1215
|
+
{
|
|
1216
|
+
"cell_type": "code",
|
|
1217
|
+
"execution_count": null,
|
|
1218
|
+
"id": "7c1b4a86",
|
|
1219
|
+
"metadata": {
|
|
1220
|
+
"execution": {
|
|
1221
|
+
"iopub.execute_input": "2026-02-02T13:01:12.395184Z",
|
|
1222
|
+
"iopub.status.busy": "2026-02-02T13:01:12.395066Z",
|
|
1223
|
+
"iopub.status.idle": "2026-02-02T13:01:12.659614Z",
|
|
1224
|
+
"shell.execute_reply": "2026-02-02T13:01:12.659019Z"
|
|
1225
|
+
},
|
|
1226
|
+
"papermill": {
|
|
1227
|
+
"duration": 0.282742,
|
|
1228
|
+
"end_time": "2026-02-02T13:01:12.660385",
|
|
1229
|
+
"exception": false,
|
|
1230
|
+
"start_time": "2026-02-02T13:01:12.377643",
|
|
1231
|
+
"status": "completed"
|
|
1232
|
+
},
|
|
1233
|
+
"tags": []
|
|
1234
|
+
},
|
|
1235
|
+
"outputs": [],
|
|
1236
|
+
"source": [
|
|
1237
|
+
"# Temporal Sparklines - Cohort Ć Time Period per feature with analysis\n",
|
|
1238
|
+
"if len(numeric_event_cols) >= 2:\n",
|
|
1239
|
+
" variances = df[numeric_event_cols].var().sort_values(ascending=False)\n",
|
|
1240
|
+
" sparkline_cols = variances.index.tolist()\n",
|
|
1241
|
+
" \n",
|
|
1242
|
+
" print(\"\\n\" + \"=\"*70)\n",
|
|
1243
|
+
" print(\"TEMPORAL SPARKLINES - COHORT Ć TIME PERIOD\")\n",
|
|
1244
|
+
" print(\"=\"*70)\n",
|
|
1245
|
+
" print(f\"\\n{len(sparkline_cols)} features analyzed across Weekly/Monthly/Yearly periods\")\n",
|
|
1246
|
+
" \n",
|
|
1247
|
+
" if ENTITY_COLUMN and TIME_COLUMN:\n",
|
|
1248
|
+
" df_spark = df.copy()\n",
|
|
1249
|
+
" df_spark['_week'] = pd.to_datetime(df_spark[TIME_COLUMN]).dt.to_period('W').dt.start_time\n",
|
|
1250
|
+
" df_spark['_month'] = pd.to_datetime(df_spark[TIME_COLUMN]).dt.to_period('M').dt.start_time\n",
|
|
1251
|
+
" df_spark['_year'] = pd.to_datetime(df_spark[TIME_COLUMN]).dt.to_period('Y').dt.start_time\n",
|
|
1252
|
+
" \n",
|
|
1253
|
+
" has_target = TARGET_COLUMN and TARGET_COLUMN in df.columns\n",
|
|
1254
|
+
" all_actions = []\n",
|
|
1255
|
+
" \n",
|
|
1256
|
+
" for col in sparkline_cols:\n",
|
|
1257
|
+
" if col not in df_spark.columns:\n",
|
|
1258
|
+
" continue\n",
|
|
1259
|
+
" \n",
|
|
1260
|
+
" feature_data = {}\n",
|
|
1261
|
+
" cohort_masks = ([(\"retained\", df_spark[TARGET_COLUMN] == 1),\n",
|
|
1262
|
+
" (\"churned\", df_spark[TARGET_COLUMN] == 0),\n",
|
|
1263
|
+
" (\"overall\", slice(None))] if has_target \n",
|
|
1264
|
+
" else [(\"overall\", slice(None))])\n",
|
|
1265
|
+
" \n",
|
|
1266
|
+
" for cohort, mask in cohort_masks:\n",
|
|
1267
|
+
" cohort_df = df_spark[mask] if isinstance(mask, pd.Series) else df_spark\n",
|
|
1268
|
+
" feature_data[cohort] = {\n",
|
|
1269
|
+
" \"weekly\": cohort_df.groupby('_week')[col].mean().dropna().tolist(),\n",
|
|
1270
|
+
" \"monthly\": cohort_df.groupby('_month')[col].mean().dropna().tolist(),\n",
|
|
1271
|
+
" \"yearly\": cohort_df.groupby('_year')[col].mean().dropna().tolist(),\n",
|
|
1272
|
+
" }\n",
|
|
1273
|
+
" \n",
|
|
1274
|
+
" period_effects = None\n",
|
|
1275
|
+
" if has_target:\n",
|
|
1276
|
+
" analysis = charts.analyze_cohort_trends(feature_data, col)\n",
|
|
1277
|
+
" period_effects = {p: analysis[\"periods\"][p][\"effect_size\"] \n",
|
|
1278
|
+
" for p in analysis[\"periods\"]}\n",
|
|
1279
|
+
" all_actions.extend(analysis.get(\"actions\", []))\n",
|
|
1280
|
+
" \n",
|
|
1281
|
+
" fig = charts.cohort_sparklines(feature_data, feature_name=col, period_effects=period_effects)\n",
|
|
1282
|
+
" display_figure(fig)\n",
|
|
1283
|
+
" \n",
|
|
1284
|
+
" if has_target and all_actions:\n",
|
|
1285
|
+
" print(\"\\n\" + \"=\"*70)\n",
|
|
1286
|
+
" print(\"TREND & VARIANCE RECOMMENDATIONS\")\n",
|
|
1287
|
+
" print(\"=\"*70)\n",
|
|
1288
|
+
" \n",
|
|
1289
|
+
" BOLD, RESET = \"\\033[1m\", \"\\033[0m\"\n",
|
|
1290
|
+
" \n",
|
|
1291
|
+
" type_labels = {\n",
|
|
1292
|
+
" \"add_trend_feature\": \"š Add Trend Features (opposite cohort trends)\",\n",
|
|
1293
|
+
" \"add_time_indicator\": \"š
Add Time Indicators (seasonality detected)\",\n",
|
|
1294
|
+
" \"robust_scale\": \"š§ Apply Robust Scaling (high variance ratio)\",\n",
|
|
1295
|
+
" \"normalize\": \"š Apply Normalization (high variance)\",\n",
|
|
1296
|
+
" }\n",
|
|
1297
|
+
" \n",
|
|
1298
|
+
" by_type = {}\n",
|
|
1299
|
+
" for action in all_actions:\n",
|
|
1300
|
+
" action_type = action[\"action_type\"]\n",
|
|
1301
|
+
" if action_type not in by_type:\n",
|
|
1302
|
+
" by_type[action_type] = []\n",
|
|
1303
|
+
" by_type[action_type].append(action)\n",
|
|
1304
|
+
" \n",
|
|
1305
|
+
" for action_type, actions in by_type.items():\n",
|
|
1306
|
+
" print(f\"\\n{type_labels.get(action_type, action_type)}:\")\n",
|
|
1307
|
+
" for a in actions:\n",
|
|
1308
|
+
" params_str = \", \".join(f\"{k}={v}\" for k, v in a.get(\"params\", {}).items())\n",
|
|
1309
|
+
" print(f\" ⢠{BOLD}{a['feature']}{RESET}: {a['reason']}\")\n",
|
|
1310
|
+
" if params_str:\n",
|
|
1311
|
+
" print(f\" params: {{{params_str}}}\")\n",
|
|
1312
|
+
"else:\n",
|
|
1313
|
+
" print(\"Insufficient numeric columns for sparkline visualization\")\n",
|
|
1314
|
+
"# Store sparkline recommendations for pattern_summary\n",
|
|
1315
|
+
"SPARKLINE_RECOMMENDATIONS = [\n",
|
|
1316
|
+
" {\"action\": a[\"action_type\"], \"feature\": a[\"feature\"], \"reason\": a[\"reason\"],\n",
|
|
1317
|
+
" \"params\": a.get(\"params\", {}), \"priority\": \"high\" if a[\"action_type\"] == \"add_trend_feature\" else \"medium\",\n",
|
|
1318
|
+
" \"features\": [f\"{a['feature']}_{a['action_type']}\"]}\n",
|
|
1319
|
+
" for a in all_actions\n",
|
|
1320
|
+
"] if 'all_actions' in dir() and all_actions else []\n"
|
|
1321
|
+
]
|
|
1322
|
+
},
|
|
1323
|
+
{
|
|
1324
|
+
"cell_type": "markdown",
|
|
1325
|
+
"id": "8e5da120",
|
|
1326
|
+
"metadata": {
|
|
1327
|
+
"papermill": {
|
|
1328
|
+
"duration": 0.034263,
|
|
1329
|
+
"end_time": "2026-02-02T13:01:12.729612",
|
|
1330
|
+
"exception": false,
|
|
1331
|
+
"start_time": "2026-02-02T13:01:12.695349",
|
|
1332
|
+
"status": "completed"
|
|
1333
|
+
},
|
|
1334
|
+
"tags": []
|
|
1335
|
+
},
|
|
1336
|
+
"source": [
|
|
1337
|
+
"## 1c.10 Entity-Level Feature Analysis (Effect Sizes)\n",
|
|
1338
|
+
"\n",
|
|
1339
|
+
"This section uses **three complementary approaches** to understand feature separation:\n",
|
|
1340
|
+
"\n",
|
|
1341
|
+
"| Approach | What It Measures | Output |\n",
|
|
1342
|
+
"|----------|------------------|--------|\n",
|
|
1343
|
+
"| **Cohen's d** | Standardized mean difference | Single number per feature |\n",
|
|
1344
|
+
"| **Correlation** | Linear relationship with target | Single number per feature |\n",
|
|
1345
|
+
"| **Box Plots** | Full distribution by cohort | Visual comparison |\n",
|
|
1346
|
+
"\n",
|
|
1347
|
+
"**š Cohen's d Interpretation:**\n",
|
|
1348
|
+
"\n",
|
|
1349
|
+
"| `\\|d\\|` Value | Effect Size | Predictive Value |\n",
|
|
1350
|
+
"|----------|-------------|------------------|\n",
|
|
1351
|
+
"| ā„ 0.8 | Large | Strong differentiator |\n",
|
|
1352
|
+
"| 0.5-0.8 | Medium | Useful signal |\n",
|
|
1353
|
+
"| 0.2-0.5 | Small | Weak signal |\n",
|
|
1354
|
+
"| < 0.2 | Negligible | Not predictive |\n",
|
|
1355
|
+
"\n",
|
|
1356
|
+
"**Connection to Sparklines (1c.9):** The d values shown in the sparkline column headers are per-period effect sizes. Here we compute entity-level effect sizes across all aggregated features.\n",
|
|
1357
|
+
"\n",
|
|
1358
|
+
"**See also:** Section 1c.8 for scatter matrix showing feature relationships with cohort overlay."
|
|
1359
|
+
]
|
|
1360
|
+
},
|
|
1361
|
+
{
|
|
1362
|
+
"cell_type": "code",
|
|
1363
|
+
"execution_count": null,
|
|
1364
|
+
"id": "2b01ed4f",
|
|
1365
|
+
"metadata": {
|
|
1366
|
+
"execution": {
|
|
1367
|
+
"iopub.execute_input": "2026-02-02T13:01:12.799852Z",
|
|
1368
|
+
"iopub.status.busy": "2026-02-02T13:01:12.799733Z",
|
|
1369
|
+
"iopub.status.idle": "2026-02-02T13:01:12.840137Z",
|
|
1370
|
+
"shell.execute_reply": "2026-02-02T13:01:12.839478Z"
|
|
1371
|
+
},
|
|
1372
|
+
"papermill": {
|
|
1373
|
+
"duration": 0.077605,
|
|
1374
|
+
"end_time": "2026-02-02T13:01:12.841471",
|
|
1375
|
+
"exception": false,
|
|
1376
|
+
"start_time": "2026-02-02T13:01:12.763866",
|
|
1377
|
+
"status": "completed"
|
|
1378
|
+
},
|
|
1379
|
+
"tags": []
|
|
1380
|
+
},
|
|
1381
|
+
"outputs": [],
|
|
1382
|
+
"source": [
|
|
1383
|
+
"# Aggregate event data to entity level for effect size analysis\n",
|
|
1384
|
+
"if ENTITY_COLUMN and TARGET_COLUMN and TARGET_COLUMN in df.columns:\n",
|
|
1385
|
+
" # Build entity-level aggregations\n",
|
|
1386
|
+
" entity_aggs = df.groupby(ENTITY_COLUMN).agg({\n",
|
|
1387
|
+
" TIME_COLUMN: ['count', 'min', 'max'],\n",
|
|
1388
|
+
" **{col: ['mean', 'sum', 'std'] for col in numeric_event_cols if col != TARGET_COLUMN}\n",
|
|
1389
|
+
" })\n",
|
|
1390
|
+
" entity_aggs.columns = ['_'.join(col).strip() for col in entity_aggs.columns]\n",
|
|
1391
|
+
" entity_aggs = entity_aggs.reset_index()\n",
|
|
1392
|
+
" \n",
|
|
1393
|
+
" # Add target\n",
|
|
1394
|
+
" entity_target = df.groupby(ENTITY_COLUMN)[TARGET_COLUMN].first().reset_index()\n",
|
|
1395
|
+
" entity_df = entity_aggs.merge(entity_target, on=ENTITY_COLUMN)\n",
|
|
1396
|
+
" \n",
|
|
1397
|
+
" # Add derived features\n",
|
|
1398
|
+
" entity_df['tenure_days'] = (entity_df[f'{TIME_COLUMN}_max'] - entity_df[f'{TIME_COLUMN}_min']).dt.days\n",
|
|
1399
|
+
" entity_df['event_count'] = entity_df[f'{TIME_COLUMN}_count']\n",
|
|
1400
|
+
" \n",
|
|
1401
|
+
" # Calculate effect sizes (Cohen's d) for entity-level features\n",
|
|
1402
|
+
" # Exclude entity, target, and temporal metadata columns\n",
|
|
1403
|
+
" effect_feature_cols = [c for c in entity_df.select_dtypes(include=[np.number]).columns\n",
|
|
1404
|
+
" if c not in [ENTITY_COLUMN, TARGET_COLUMN]\n",
|
|
1405
|
+
" and c not in TEMPORAL_METADATA_COLS]\n",
|
|
1406
|
+
" \n",
|
|
1407
|
+
" print(\"=\"*80)\n",
|
|
1408
|
+
" print(\"ENTITY-LEVEL FEATURE EFFECT SIZES (Cohen's d)\")\n",
|
|
1409
|
+
" print(\"=\"*80)\n",
|
|
1410
|
+
" print(f\"\\nAnalyzing {len(effect_feature_cols)} aggregated features at entity level\")\n",
|
|
1411
|
+
" print(f\"Entities: {len(entity_df):,} (Retained: {(entity_df[TARGET_COLUMN]==1).sum():,}, Churned: {(entity_df[TARGET_COLUMN]==0).sum():,})\\n\")\n",
|
|
1412
|
+
" \n",
|
|
1413
|
+
" effect_sizes = []\n",
|
|
1414
|
+
" for col in effect_feature_cols:\n",
|
|
1415
|
+
" churned = entity_df[entity_df[TARGET_COLUMN] == 0][col].dropna()\n",
|
|
1416
|
+
" retained = entity_df[entity_df[TARGET_COLUMN] == 1][col].dropna()\n",
|
|
1417
|
+
" \n",
|
|
1418
|
+
" if len(churned) > 0 and len(retained) > 0:\n",
|
|
1419
|
+
" pooled_std = np.sqrt(((len(churned)-1)*churned.std()**2 + (len(retained)-1)*retained.std()**2) / \n",
|
|
1420
|
+
" (len(churned) + len(retained) - 2))\n",
|
|
1421
|
+
" d = (retained.mean() - churned.mean()) / pooled_std if pooled_std > 0 else 0\n",
|
|
1422
|
+
" \n",
|
|
1423
|
+
" abs_d = abs(d)\n",
|
|
1424
|
+
" if abs_d >= 0.8:\n",
|
|
1425
|
+
" interp, emoji = \"Large effect\", \"š“\"\n",
|
|
1426
|
+
" elif abs_d >= 0.5:\n",
|
|
1427
|
+
" interp, emoji = \"Medium effect\", \"š”\"\n",
|
|
1428
|
+
" elif abs_d >= 0.2:\n",
|
|
1429
|
+
" interp, emoji = \"Small effect\", \"š¢\"\n",
|
|
1430
|
+
" else:\n",
|
|
1431
|
+
" interp, emoji = \"Negligible\", \"āŖ\"\n",
|
|
1432
|
+
" \n",
|
|
1433
|
+
" effect_sizes.append({\n",
|
|
1434
|
+
" \"feature\": col, \"cohens_d\": d, \"abs_d\": abs_d, \n",
|
|
1435
|
+
" \"interpretation\": interp, \"emoji\": emoji,\n",
|
|
1436
|
+
" \"retained_mean\": retained.mean(), \"churned_mean\": churned.mean()\n",
|
|
1437
|
+
" })\n",
|
|
1438
|
+
" \n",
|
|
1439
|
+
" # Sort and display\n",
|
|
1440
|
+
" effect_df = pd.DataFrame(effect_sizes).sort_values(\"abs_d\", ascending=False)\n",
|
|
1441
|
+
" \n",
|
|
1442
|
+
" print(f\"{'Feature':<35} {'d':>8} {'Effect':<15} {'Direction':<20}\")\n",
|
|
1443
|
+
" print(\"-\" * 80)\n",
|
|
1444
|
+
" for _, row in effect_df.head(15).iterrows():\n",
|
|
1445
|
+
" direction = \"ā Higher in retained\" if row[\"cohens_d\"] > 0 else \"ā Lower in retained\"\n",
|
|
1446
|
+
" print(f\"{row['emoji']} {row['feature'][:33]:<33} {row['cohens_d']:>+8.3f} {row['interpretation']:<15} {direction:<20}\")\n",
|
|
1447
|
+
" \n",
|
|
1448
|
+
" # Categorize features\n",
|
|
1449
|
+
" large_effect = effect_df[effect_df[\"abs_d\"] >= 0.8][\"feature\"].tolist()\n",
|
|
1450
|
+
" medium_effect = effect_df[(effect_df[\"abs_d\"] >= 0.5) & (effect_df[\"abs_d\"] < 0.8)][\"feature\"].tolist()\n",
|
|
1451
|
+
" small_effect = effect_df[(effect_df[\"abs_d\"] >= 0.2) & (effect_df[\"abs_d\"] < 0.5)][\"feature\"].tolist()\n",
|
|
1452
|
+
" \n",
|
|
1453
|
+
" # INTERPRETATION\n",
|
|
1454
|
+
" print(\"\\n\" + \"ā\"*80)\n",
|
|
1455
|
+
" print(\"š INTERPRETATION & RECOMMENDATIONS\")\n",
|
|
1456
|
+
" print(\"ā\"*80)\n",
|
|
1457
|
+
" \n",
|
|
1458
|
+
" if large_effect:\n",
|
|
1459
|
+
" print(f\"\\nš“ LARGE EFFECT (|d| ā„ 0.8) - Priority Features:\")\n",
|
|
1460
|
+
" for f in large_effect[:5]:\n",
|
|
1461
|
+
" row = effect_df[effect_df[\"feature\"] == f].iloc[0]\n",
|
|
1462
|
+
" direction = \"higher\" if row[\"cohens_d\"] > 0 else \"lower\"\n",
|
|
1463
|
+
" print(f\" ⢠{f}: Retained customers have {direction} values\")\n",
|
|
1464
|
+
" print(f\" Mean: Retained={row['retained_mean']:.2f}, Churned={row['churned_mean']:.2f}\")\n",
|
|
1465
|
+
" print(\" ā MUST include in predictive model\")\n",
|
|
1466
|
+
" \n",
|
|
1467
|
+
" if medium_effect:\n",
|
|
1468
|
+
" print(f\"\\nš” MEDIUM EFFECT (0.5 ⤠|d| < 0.8) - Useful Features:\")\n",
|
|
1469
|
+
" for f in medium_effect[:3]:\n",
|
|
1470
|
+
" print(f\" ⢠{f}\")\n",
|
|
1471
|
+
" print(\" ā Should include in model\")\n",
|
|
1472
|
+
" \n",
|
|
1473
|
+
" if small_effect:\n",
|
|
1474
|
+
" print(f\"\\nš¢ SMALL EFFECT (0.2 ⤠|d| < 0.5) - Supporting Features:\")\n",
|
|
1475
|
+
" print(f\" {', '.join(small_effect[:5])}\")\n",
|
|
1476
|
+
" print(\" ā May help in combination with other features\")\n",
|
|
1477
|
+
" \n",
|
|
1478
|
+
" negligible = effect_df[effect_df[\"abs_d\"] < 0.2][\"feature\"].tolist()\n",
|
|
1479
|
+
" if negligible:\n",
|
|
1480
|
+
" print(f\"\\nāŖ NEGLIGIBLE EFFECT (|d| < 0.2): {len(negligible)} features\")\n",
|
|
1481
|
+
" print(\" ā Consider engineering or dropping from model\")\n",
|
|
1482
|
+
"else:\n",
|
|
1483
|
+
" print(\"Entity column or target not available for effect size analysis\")"
|
|
1484
|
+
]
|
|
1485
|
+
},
|
|
1486
|
+
{
|
|
1487
|
+
"cell_type": "code",
|
|
1488
|
+
"execution_count": null,
|
|
1489
|
+
"id": "08b6ce74",
|
|
1490
|
+
"metadata": {
|
|
1491
|
+
"execution": {
|
|
1492
|
+
"iopub.execute_input": "2026-02-02T13:01:12.914564Z",
|
|
1493
|
+
"iopub.status.busy": "2026-02-02T13:01:12.914442Z",
|
|
1494
|
+
"iopub.status.idle": "2026-02-02T13:01:12.947896Z",
|
|
1495
|
+
"shell.execute_reply": "2026-02-02T13:01:12.947462Z"
|
|
1496
|
+
},
|
|
1497
|
+
"papermill": {
|
|
1498
|
+
"duration": 0.069708,
|
|
1499
|
+
"end_time": "2026-02-02T13:01:12.949169",
|
|
1500
|
+
"exception": false,
|
|
1501
|
+
"start_time": "2026-02-02T13:01:12.879461",
|
|
1502
|
+
"status": "completed"
|
|
1503
|
+
},
|
|
1504
|
+
"tags": []
|
|
1505
|
+
},
|
|
1506
|
+
"outputs": [],
|
|
1507
|
+
"source": [
|
|
1508
|
+
"# Box Plots: Entity-level feature distributions by target\n",
|
|
1509
|
+
"if ENTITY_COLUMN and TARGET_COLUMN and 'entity_df' in dir() and len(effect_df) > 0:\n",
|
|
1510
|
+
" # Select top features by effect size for visualization\n",
|
|
1511
|
+
" top_features = effect_df.head(6)[\"feature\"].tolist()\n",
|
|
1512
|
+
" n_features = len(top_features)\n",
|
|
1513
|
+
" \n",
|
|
1514
|
+
" if n_features > 0:\n",
|
|
1515
|
+
" print(\"=\"*70)\n",
|
|
1516
|
+
" print(\"DISTRIBUTION COMPARISON: Retained vs Churned (Box Plots)\")\n",
|
|
1517
|
+
" print(\"=\"*70)\n",
|
|
1518
|
+
" print(\"\\nš Showing top 6 features by effect size\")\n",
|
|
1519
|
+
" print(\" š¢ Green = Retained | š“ Red = Churned\\n\")\n",
|
|
1520
|
+
" \n",
|
|
1521
|
+
" fig = make_subplots(rows=1, cols=n_features, subplot_titles=top_features, horizontal_spacing=0.05)\n",
|
|
1522
|
+
" \n",
|
|
1523
|
+
" for i, col in enumerate(top_features):\n",
|
|
1524
|
+
" col_num = i + 1\n",
|
|
1525
|
+
" \n",
|
|
1526
|
+
" # Retained (1) - Green\n",
|
|
1527
|
+
" retained_data = entity_df[entity_df[TARGET_COLUMN] == 1][col].dropna()\n",
|
|
1528
|
+
" fig.add_trace(go.Box(y=retained_data, name='Retained',\n",
|
|
1529
|
+
" fillcolor='rgba(46, 204, 113, 0.7)', line=dict(color='#1e8449', width=2),\n",
|
|
1530
|
+
" boxpoints='outliers', width=0.35, showlegend=(i == 0), legendgroup='retained',\n",
|
|
1531
|
+
" marker=dict(color='rgba(46, 204, 113, 0.5)', size=4)), row=1, col=col_num)\n",
|
|
1532
|
+
" \n",
|
|
1533
|
+
" # Churned (0) - Red\n",
|
|
1534
|
+
" churned_data = entity_df[entity_df[TARGET_COLUMN] == 0][col].dropna()\n",
|
|
1535
|
+
" fig.add_trace(go.Box(y=churned_data, name='Churned',\n",
|
|
1536
|
+
" fillcolor='rgba(231, 76, 60, 0.7)', line=dict(color='#922b21', width=2),\n",
|
|
1537
|
+
" boxpoints='outliers', width=0.35, showlegend=(i == 0), legendgroup='churned',\n",
|
|
1538
|
+
" marker=dict(color='rgba(231, 76, 60, 0.5)', size=4)), row=1, col=col_num)\n",
|
|
1539
|
+
" \n",
|
|
1540
|
+
" fig.update_layout(height=450, title_text=\"Top Features: Retained (Green) vs Churned (Red)\",\n",
|
|
1541
|
+
" template='plotly_white', showlegend=True, boxmode='group',\n",
|
|
1542
|
+
" legend=dict(orientation=\"h\", yanchor=\"bottom\", y=1.05, xanchor=\"center\", x=0.5))\n",
|
|
1543
|
+
" fig.update_xaxes(showticklabels=False)\n",
|
|
1544
|
+
" display_figure(fig)\n",
|
|
1545
|
+
" \n",
|
|
1546
|
+
" # INTERPRETATION\n",
|
|
1547
|
+
" print(\"ā\"*70)\n",
|
|
1548
|
+
" print(\"š HOW TO READ BOX PLOTS\")\n",
|
|
1549
|
+
" print(\"ā\"*70)\n",
|
|
1550
|
+
" print(\"\"\"\n",
|
|
1551
|
+
"Box Plot Elements:\n",
|
|
1552
|
+
" ⢠Box = Middle 50% of data (IQR: 25th to 75th percentile)\n",
|
|
1553
|
+
" ⢠Line inside box = Median (50th percentile)\n",
|
|
1554
|
+
" ⢠Whiskers = 1.5 à IQR from box edges\n",
|
|
1555
|
+
" ⢠Dots outside = Outliers\n",
|
|
1556
|
+
"\n",
|
|
1557
|
+
"What makes a good predictor:\n",
|
|
1558
|
+
" ā Clear SEPARATION between green and red boxes\n",
|
|
1559
|
+
" ā Different MEDIANS (center lines at different heights)\n",
|
|
1560
|
+
" ā Minimal OVERLAP between boxes\n",
|
|
1561
|
+
"\n",
|
|
1562
|
+
"Patterns to look for:\n",
|
|
1563
|
+
" ⢠Green box entirely above red ā Retained have higher values\n",
|
|
1564
|
+
" ⢠Green box entirely below red ā Retained have lower values\n",
|
|
1565
|
+
" ⢠Overlapping boxes ā Feature alone may not discriminate well\n",
|
|
1566
|
+
" ⢠Many outliers in one group ā Subpopulations worth investigating\n",
|
|
1567
|
+
"\"\"\")"
|
|
1568
|
+
]
|
|
1569
|
+
},
|
|
1570
|
+
{
|
|
1571
|
+
"cell_type": "code",
|
|
1572
|
+
"execution_count": null,
|
|
1573
|
+
"id": "58ee08bc",
|
|
1574
|
+
"metadata": {
|
|
1575
|
+
"execution": {
|
|
1576
|
+
"iopub.execute_input": "2026-02-02T13:01:13.024026Z",
|
|
1577
|
+
"iopub.status.busy": "2026-02-02T13:01:13.023843Z",
|
|
1578
|
+
"iopub.status.idle": "2026-02-02T13:01:13.041333Z",
|
|
1579
|
+
"shell.execute_reply": "2026-02-02T13:01:13.040834Z"
|
|
1580
|
+
},
|
|
1581
|
+
"papermill": {
|
|
1582
|
+
"duration": 0.055619,
|
|
1583
|
+
"end_time": "2026-02-02T13:01:13.041926",
|
|
1584
|
+
"exception": false,
|
|
1585
|
+
"start_time": "2026-02-02T13:01:12.986307",
|
|
1586
|
+
"status": "completed"
|
|
1587
|
+
},
|
|
1588
|
+
"tags": []
|
|
1589
|
+
},
|
|
1590
|
+
"outputs": [],
|
|
1591
|
+
"source": [
|
|
1592
|
+
"# Feature-Target Correlation Ranking\n",
|
|
1593
|
+
"if ENTITY_COLUMN and TARGET_COLUMN and 'entity_df' in dir():\n",
|
|
1594
|
+
" print(\"=\"*70)\n",
|
|
1595
|
+
" print(\"FEATURE-TARGET CORRELATIONS (Entity-Level)\")\n",
|
|
1596
|
+
" print(\"=\"*70)\n",
|
|
1597
|
+
" \n",
|
|
1598
|
+
" correlations = []\n",
|
|
1599
|
+
" for col in effect_feature_cols:\n",
|
|
1600
|
+
" if col != TARGET_COLUMN:\n",
|
|
1601
|
+
" corr = entity_df[[col, TARGET_COLUMN]].corr().iloc[0, 1]\n",
|
|
1602
|
+
" if not np.isnan(corr):\n",
|
|
1603
|
+
" correlations.append({\"Feature\": col, \"Correlation\": corr})\n",
|
|
1604
|
+
" \n",
|
|
1605
|
+
" if correlations:\n",
|
|
1606
|
+
" corr_df = pd.DataFrame(correlations).sort_values(\"Correlation\", key=abs, ascending=False)\n",
|
|
1607
|
+
" \n",
|
|
1608
|
+
" fig = charts.bar_chart(\n",
|
|
1609
|
+
" corr_df[\"Feature\"].head(12).tolist(),\n",
|
|
1610
|
+
" corr_df[\"Correlation\"].head(12).tolist(),\n",
|
|
1611
|
+
" title=f\"Feature Correlations with {TARGET_COLUMN}\"\n",
|
|
1612
|
+
" )\n",
|
|
1613
|
+
" display_figure(fig)\n",
|
|
1614
|
+
" \n",
|
|
1615
|
+
" print(\"\\nš Correlation Rankings:\")\n",
|
|
1616
|
+
" print(f\"{'Feature':<35} {'Correlation':>12} {'Strength':<15} {'Direction'}\")\n",
|
|
1617
|
+
" print(\"-\" * 75)\n",
|
|
1618
|
+
" \n",
|
|
1619
|
+
" for _, row in corr_df.head(10).iterrows():\n",
|
|
1620
|
+
" abs_corr = abs(row[\"Correlation\"])\n",
|
|
1621
|
+
" if abs_corr >= 0.5:\n",
|
|
1622
|
+
" strength = \"Strong\"\n",
|
|
1623
|
+
" elif abs_corr >= 0.3:\n",
|
|
1624
|
+
" strength = \"Moderate\"\n",
|
|
1625
|
+
" elif abs_corr >= 0.1:\n",
|
|
1626
|
+
" strength = \"Weak\"\n",
|
|
1627
|
+
" else:\n",
|
|
1628
|
+
" strength = \"Very weak\"\n",
|
|
1629
|
+
" \n",
|
|
1630
|
+
" direction = \"Positive\" if row[\"Correlation\"] > 0 else \"Negative\"\n",
|
|
1631
|
+
" print(f\"{row['Feature'][:34]:<35} {row['Correlation']:>+12.3f} {strength:<15} {direction}\")\n",
|
|
1632
|
+
" \n",
|
|
1633
|
+
" # INTERPRETATION\n",
|
|
1634
|
+
" print(\"\\n\" + \"ā\"*70)\n",
|
|
1635
|
+
" print(\"š INTERPRETING CORRELATIONS WITH TARGET\")\n",
|
|
1636
|
+
" print(\"ā\"*70)\n",
|
|
1637
|
+
" print(\"\"\"\n",
|
|
1638
|
+
"Correlation with binary target (retained=1, churned=0):\n",
|
|
1639
|
+
"\n",
|
|
1640
|
+
" Positive correlation (+): Higher values ā more likely RETAINED\n",
|
|
1641
|
+
" Negative correlation (-): Higher values ā more likely CHURNED\n",
|
|
1642
|
+
"\n",
|
|
1643
|
+
"Strength guide:\n",
|
|
1644
|
+
" |r| > 0.5: Strong - prioritize this feature\n",
|
|
1645
|
+
" |r| 0.3-0.5: Moderate - useful predictor\n",
|
|
1646
|
+
" |r| 0.1-0.3: Weak - may help in combination\n",
|
|
1647
|
+
" |r| < 0.1: Very weak - limited predictive value\n",
|
|
1648
|
+
"\n",
|
|
1649
|
+
"Note: Correlation captures LINEAR relationships only.\n",
|
|
1650
|
+
"Non-linear relationships may have low correlation but still be predictive.\n",
|
|
1651
|
+
"\"\"\")"
|
|
1652
|
+
]
|
|
1653
|
+
},
|
|
1654
|
+
{
|
|
1655
|
+
"cell_type": "code",
|
|
1656
|
+
"execution_count": null,
|
|
1657
|
+
"id": "cf266d12",
|
|
1658
|
+
"metadata": {
|
|
1659
|
+
"execution": {
|
|
1660
|
+
"iopub.execute_input": "2026-02-02T13:01:13.166795Z",
|
|
1661
|
+
"iopub.status.busy": "2026-02-02T13:01:13.166685Z",
|
|
1662
|
+
"iopub.status.idle": "2026-02-02T13:01:13.172173Z",
|
|
1663
|
+
"shell.execute_reply": "2026-02-02T13:01:13.171764Z"
|
|
1664
|
+
},
|
|
1665
|
+
"papermill": {
|
|
1666
|
+
"duration": 0.045238,
|
|
1667
|
+
"end_time": "2026-02-02T13:01:13.173037",
|
|
1668
|
+
"exception": false,
|
|
1669
|
+
"start_time": "2026-02-02T13:01:13.127799",
|
|
1670
|
+
"status": "completed"
|
|
1671
|
+
},
|
|
1672
|
+
"tags": []
|
|
1673
|
+
},
|
|
1674
|
+
"outputs": [],
|
|
1675
|
+
"source": [
|
|
1676
|
+
"# Entity-Level Analysis: Summary\n",
|
|
1677
|
+
"print(\"\\n\" + \"=\"*70)\n",
|
|
1678
|
+
"print(\"ENTITY-LEVEL FEATURE SUMMARY\")\n",
|
|
1679
|
+
"print(\"=\"*70)\n",
|
|
1680
|
+
"\n",
|
|
1681
|
+
"if 'effect_df' in dir() and len(effect_df) > 0:\n",
|
|
1682
|
+
" large_effects = effect_df[effect_df['cohens_d'].abs() >= 0.5]\n",
|
|
1683
|
+
" print(f\"\\nš Effect Size Summary:\")\n",
|
|
1684
|
+
" print(f\" ⢠Total features analyzed: {len(effect_df)}\")\n",
|
|
1685
|
+
" print(f\" ⢠Features with |d| ℠0.5 (medium+): {len(large_effects)}\")\n",
|
|
1686
|
+
" print(f\" ⢠Features with |d| < 0.2 (negligible): {(effect_df['cohens_d'].abs() < 0.2).sum()}\")\n",
|
|
1687
|
+
" \n",
|
|
1688
|
+
" if len(large_effects) > 0:\n",
|
|
1689
|
+
" print(\"\\n Top differentiators:\")\n",
|
|
1690
|
+
" for _, row in large_effects.head(5).iterrows():\n",
|
|
1691
|
+
" direction = \"ā higher in retained\" if row['cohens_d'] > 0 else \"ā lower in retained\"\n",
|
|
1692
|
+
" print(f\" ⢠\\033[1m{row['feature']}\\033[0m: d={row['cohens_d']:+.2f} ({direction})\")\n",
|
|
1693
|
+
" \n",
|
|
1694
|
+
" print(\"\\nš What the Three Approaches Showed:\")\n",
|
|
1695
|
+
" print(\" ⢠Cohen's d ā identified features with strongest mean separation\")\n",
|
|
1696
|
+
" print(\" ⢠Correlation ā confirmed linear relationship direction\")\n",
|
|
1697
|
+
" print(\" ⢠Box plots ā revealed distribution shapes and outliers\")\n",
|
|
1698
|
+
" \n",
|
|
1699
|
+
" print(\"\\nš” RECOMMENDATIONS:\")\n",
|
|
1700
|
+
" print(\" ā Prioritize features with |d| > 0.5 in model\")\n",
|
|
1701
|
+
" print(\" ā Consider dropping features with |d| < 0.2\")\n",
|
|
1702
|
+
" print(\" ā Check box plots for non-normal distributions that may need transformation\")\n",
|
|
1703
|
+
"else:\n",
|
|
1704
|
+
" print(\"\\nā ļø Effect size analysis not performed\")\n",
|
|
1705
|
+
"\n",
|
|
1706
|
+
"# Store effect size recommendations for pattern_summary\n",
|
|
1707
|
+
"EFFECT_SIZE_RECOMMENDATIONS = []\n",
|
|
1708
|
+
"if 'effect_df' in dir() and len(effect_df) > 0:\n",
|
|
1709
|
+
" for _, row in effect_df.iterrows():\n",
|
|
1710
|
+
" abs_d = abs(row['cohens_d'])\n",
|
|
1711
|
+
" if abs_d >= 0.5:\n",
|
|
1712
|
+
" EFFECT_SIZE_RECOMMENDATIONS.append({\n",
|
|
1713
|
+
" \"action\": \"prioritize_feature\", \"feature\": row['feature'],\n",
|
|
1714
|
+
" \"effect_size\": row['cohens_d'], \"priority\": \"high\" if abs_d >= 0.8 else \"medium\",\n",
|
|
1715
|
+
" \"reason\": f\"Cohen's d={row['cohens_d']:.2f} shows {'large' if abs_d >= 0.8 else 'medium'} effect\",\n",
|
|
1716
|
+
" \"features\": [row['feature']]\n",
|
|
1717
|
+
" })\n",
|
|
1718
|
+
" elif abs_d < 0.2:\n",
|
|
1719
|
+
" EFFECT_SIZE_RECOMMENDATIONS.append({\n",
|
|
1720
|
+
" \"action\": \"consider_dropping\", \"feature\": row['feature'],\n",
|
|
1721
|
+
" \"effect_size\": row['cohens_d'], \"priority\": \"low\",\n",
|
|
1722
|
+
" \"reason\": f\"Cohen's d={row['cohens_d']:.2f} shows negligible effect\",\n",
|
|
1723
|
+
" \"features\": [] # No feature to add, just informational\n",
|
|
1724
|
+
" })\n"
|
|
1725
|
+
]
|
|
1726
|
+
},
|
|
1727
|
+
{
|
|
1728
|
+
"cell_type": "markdown",
|
|
1729
|
+
"id": "5743176c",
|
|
1730
|
+
"metadata": {
|
|
1731
|
+
"papermill": {
|
|
1732
|
+
"duration": 0.039847,
|
|
1733
|
+
"end_time": "2026-02-02T13:01:13.251004",
|
|
1734
|
+
"exception": false,
|
|
1735
|
+
"start_time": "2026-02-02T13:01:13.211157",
|
|
1736
|
+
"status": "completed"
|
|
1737
|
+
},
|
|
1738
|
+
"tags": []
|
|
1739
|
+
},
|
|
1740
|
+
"source": [
|
|
1741
|
+
"## 1c.11 Recency Analysis\n",
|
|
1742
|
+
"\n",
|
|
1743
|
+
"**š What is Recency?**\n",
|
|
1744
|
+
"Days since each entity's last event. A key predictor in churn models.\n",
|
|
1745
|
+
"\n",
|
|
1746
|
+
"**š How to Read the Panel:**\n",
|
|
1747
|
+
"- **Top Row**: Distribution histograms for Retained vs Churned\n",
|
|
1748
|
+
" - Compare shapes: Similar = weak signal, Different = strong signal\n",
|
|
1749
|
+
" - Compare medians: Large gap = recency discriminates well\n",
|
|
1750
|
+
"- **Bottom Left**: Target rate by recency bucket\n",
|
|
1751
|
+
" - Look for: Monotonic decline, sharp thresholds, or flat patterns\n",
|
|
1752
|
+
" - Inflection points suggest where to create binary flags\n",
|
|
1753
|
+
"\n",
|
|
1754
|
+
"**ā
Pattern Interpretation:**\n",
|
|
1755
|
+
"| Pattern | Meaning | Feature Strategy |\n",
|
|
1756
|
+
"|---------|---------|------------------|\n",
|
|
1757
|
+
"| Monotonic decline | Gradual disengagement | Use continuous recency |\n",
|
|
1758
|
+
"| Threshold/step | Clear activity boundary | Create binary is_active_Nd flag |\n",
|
|
1759
|
+
"| Flat | Recency not predictive | May omit or use only in combination |"
|
|
1760
|
+
]
|
|
1761
|
+
},
|
|
1762
|
+
{
|
|
1763
|
+
"cell_type": "code",
|
|
1764
|
+
"execution_count": null,
|
|
1765
|
+
"id": "a8057048",
|
|
1766
|
+
"metadata": {
|
|
1767
|
+
"execution": {
|
|
1768
|
+
"iopub.execute_input": "2026-02-02T13:01:13.328727Z",
|
|
1769
|
+
"iopub.status.busy": "2026-02-02T13:01:13.328616Z",
|
|
1770
|
+
"iopub.status.idle": "2026-02-02T13:01:13.444922Z",
|
|
1771
|
+
"shell.execute_reply": "2026-02-02T13:01:13.444409Z"
|
|
1772
|
+
},
|
|
1773
|
+
"papermill": {
|
|
1774
|
+
"duration": 0.15618,
|
|
1775
|
+
"end_time": "2026-02-02T13:01:13.445729",
|
|
1776
|
+
"exception": false,
|
|
1777
|
+
"start_time": "2026-02-02T13:01:13.289549",
|
|
1778
|
+
"status": "completed"
|
|
1779
|
+
},
|
|
1780
|
+
"tags": []
|
|
1781
|
+
},
|
|
1782
|
+
"outputs": [],
|
|
1783
|
+
"source": [
|
|
1784
|
+
"# Recency Analysis - Combined visualization and insights\n",
|
|
1785
|
+
"from customer_retention.analysis.visualization import console\n",
|
|
1786
|
+
"from customer_retention.stages.profiling import compare_recency_by_target\n",
|
|
1787
|
+
"\n",
|
|
1788
|
+
"recency_comparison = None\n",
|
|
1789
|
+
"recency_result = None\n",
|
|
1790
|
+
"RECENCY_RECOMMENDATIONS = []\n",
|
|
1791
|
+
"\n",
|
|
1792
|
+
"if ENTITY_COLUMN:\n",
|
|
1793
|
+
" reference_date = df[TIME_COLUMN].max()\n",
|
|
1794
|
+
" \n",
|
|
1795
|
+
" # Compute recency_result for use in summary cells\n",
|
|
1796
|
+
" recency_result = analyzer.analyze_recency(df, ENTITY_COLUMN, TARGET_COLUMN, reference_date)\n",
|
|
1797
|
+
" \n",
|
|
1798
|
+
" if TARGET_COLUMN and TARGET_COLUMN in df.columns:\n",
|
|
1799
|
+
" recency_comparison = compare_recency_by_target(\n",
|
|
1800
|
+
" df, ENTITY_COLUMN, TIME_COLUMN, TARGET_COLUMN, reference_date\n",
|
|
1801
|
+
" )\n",
|
|
1802
|
+
" \n",
|
|
1803
|
+
" if recency_comparison:\n",
|
|
1804
|
+
" # Combined visualization panel\n",
|
|
1805
|
+
" entity_last = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].max().reset_index()\n",
|
|
1806
|
+
" entity_last[\"recency_days\"] = (reference_date - entity_last[TIME_COLUMN]).dt.days\n",
|
|
1807
|
+
" entity_target = df.groupby(ENTITY_COLUMN)[TARGET_COLUMN].first().reset_index()\n",
|
|
1808
|
+
" entity_recency = entity_last.merge(entity_target, on=ENTITY_COLUMN)\n",
|
|
1809
|
+
" cap = entity_recency[\"recency_days\"].quantile(0.99)\n",
|
|
1810
|
+
" entity_capped = entity_recency[entity_recency[\"recency_days\"] <= cap]\n",
|
|
1811
|
+
" \n",
|
|
1812
|
+
" retained = entity_capped[entity_capped[TARGET_COLUMN] == 1][\"recency_days\"].values\n",
|
|
1813
|
+
" churned = entity_capped[entity_capped[TARGET_COLUMN] == 0][\"recency_days\"].values\n",
|
|
1814
|
+
" \n",
|
|
1815
|
+
" fig = charts.recency_analysis_panel(\n",
|
|
1816
|
+
" retained_recency=retained,\n",
|
|
1817
|
+
" churned_recency=churned,\n",
|
|
1818
|
+
" bucket_stats=recency_comparison.bucket_stats,\n",
|
|
1819
|
+
" retained_median=recency_comparison.retained_stats.median,\n",
|
|
1820
|
+
" churned_median=recency_comparison.churned_stats.median,\n",
|
|
1821
|
+
" cap_value=cap\n",
|
|
1822
|
+
" )\n",
|
|
1823
|
+
" display_figure(fig)\n",
|
|
1824
|
+
" \n",
|
|
1825
|
+
" # Key Findings\n",
|
|
1826
|
+
" console.start_section()\n",
|
|
1827
|
+
" console.header(\"Key Findings\")\n",
|
|
1828
|
+
" for insight in recency_comparison.key_findings:\n",
|
|
1829
|
+
" console.info(insight.finding)\n",
|
|
1830
|
+
" console.end_section()\n",
|
|
1831
|
+
" \n",
|
|
1832
|
+
" # Statistics\n",
|
|
1833
|
+
" ret, churn = recency_comparison.retained_stats, recency_comparison.churned_stats\n",
|
|
1834
|
+
" console.start_section()\n",
|
|
1835
|
+
" console.header(\"Detailed Statistics\")\n",
|
|
1836
|
+
" console.metric(\"Retained (n)\", f\"{ret.count:,}\")\n",
|
|
1837
|
+
" console.metric(\"Churned (n)\", f\"{churn.count:,}\")\n",
|
|
1838
|
+
" print(f\"{'Metric':<15} {'Retained':>12} {'Churned':>12} {'Diff':>12}\")\n",
|
|
1839
|
+
" print(\"-\" * 52)\n",
|
|
1840
|
+
" for name, r, c in [(\"Mean\", ret.mean, churn.mean), (\"Median\", ret.median, churn.median), \n",
|
|
1841
|
+
" (\"Std Dev\", ret.std, churn.std)]:\n",
|
|
1842
|
+
" print(f\"{name:<15} {r:>12.1f} {c:>12.1f} {c-r:>+12.1f}\")\n",
|
|
1843
|
+
" console.metric(\"Effect Size\", f\"{recency_comparison.cohens_d:+.2f} ({recency_comparison.effect_interpretation})\")\n",
|
|
1844
|
+
" console.metric(\"Pattern\", recency_comparison.distribution_pattern.replace(\"_\", \" \").title())\n",
|
|
1845
|
+
" if recency_comparison.inflection_bucket:\n",
|
|
1846
|
+
" console.metric(\"Inflection\", recency_comparison.inflection_bucket)\n",
|
|
1847
|
+
" console.end_section()\n",
|
|
1848
|
+
" \n",
|
|
1849
|
+
" # Actionable Recommendations\n",
|
|
1850
|
+
" console.start_section()\n",
|
|
1851
|
+
" console.header(\"Actionable Recommendations\")\n",
|
|
1852
|
+
" RECENCY_RECOMMENDATIONS = recency_comparison.recommendations\n",
|
|
1853
|
+
" for rec in RECENCY_RECOMMENDATIONS:\n",
|
|
1854
|
+
" priority = rec.get(\"priority\", \"medium\")\n",
|
|
1855
|
+
" priority_icon = {\"high\": \"š“\", \"medium\": \"š”\", \"low\": \"š¢\"}.get(priority, \"āŖ\")\n",
|
|
1856
|
+
" console.info(f\"{priority_icon} [{priority.upper()}] {rec['action'].replace('_', ' ').title()}\")\n",
|
|
1857
|
+
" console.info(f\" {rec['reason']}\")\n",
|
|
1858
|
+
" if rec.get(\"features\"):\n",
|
|
1859
|
+
" console.metric(\"Features\", \", \".join(rec[\"features\"]))\n",
|
|
1860
|
+
" console.end_section()\n",
|
|
1861
|
+
" else:\n",
|
|
1862
|
+
" # No target - show basic recency distribution\n",
|
|
1863
|
+
" entity_last = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].max().reset_index()\n",
|
|
1864
|
+
" entity_last[\"recency_days\"] = (reference_date - entity_last[TIME_COLUMN]).dt.days\n",
|
|
1865
|
+
" median_recency = entity_last[\"recency_days\"].median()\n",
|
|
1866
|
+
" cap = entity_last[\"recency_days\"].quantile(0.99)\n",
|
|
1867
|
+
" capped = entity_last[entity_last[\"recency_days\"] <= cap]\n",
|
|
1868
|
+
" \n",
|
|
1869
|
+
" fig = go.Figure()\n",
|
|
1870
|
+
" fig.add_trace(go.Histogram(x=capped[\"recency_days\"], nbinsx=50, marker_color=\"coral\", opacity=0.7))\n",
|
|
1871
|
+
" fig.add_vline(x=median_recency, line_dash=\"solid\", line_color=\"green\", annotation_text=f\"Median: {median_recency:.0f} days\")\n",
|
|
1872
|
+
" fig.update_layout(title=f\"Recency Distribution (capped at {cap:.0f} days)\", xaxis_title=\"Days Since Last Event\", yaxis_title=\"Count\", template=\"plotly_white\", height=400)\n",
|
|
1873
|
+
" display_figure(fig)\n",
|
|
1874
|
+
" \n",
|
|
1875
|
+
" console.start_section()\n",
|
|
1876
|
+
" console.header(\"Recency Statistics\")\n",
|
|
1877
|
+
" console.metric(\"Median\", f\"{median_recency:.0f} days\")\n",
|
|
1878
|
+
" console.metric(\"Mean\", f\"{entity_last['recency_days'].mean():.0f} days\")\n",
|
|
1879
|
+
" console.info(\"No target column - cannot compare retained vs churned\")\n",
|
|
1880
|
+
" console.end_section()"
|
|
1881
|
+
]
|
|
1882
|
+
},
|
|
1883
|
+
{
|
|
1884
|
+
"cell_type": "markdown",
|
|
1885
|
+
"id": "3a214f94",
|
|
1886
|
+
"metadata": {
|
|
1887
|
+
"papermill": {
|
|
1888
|
+
"duration": 0.039497,
|
|
1889
|
+
"end_time": "2026-02-02T13:01:13.525285",
|
|
1890
|
+
"exception": false,
|
|
1891
|
+
"start_time": "2026-02-02T13:01:13.485788",
|
|
1892
|
+
"status": "completed"
|
|
1893
|
+
},
|
|
1894
|
+
"tags": []
|
|
1895
|
+
},
|
|
1896
|
+
"source": [
|
|
1897
|
+
"## 1c.12 Velocity & Acceleration Analysis\n",
|
|
1898
|
+
"\n",
|
|
1899
|
+
"**š Why Velocity and Acceleration Matter:**\n",
|
|
1900
|
+
"\n",
|
|
1901
|
+
"| Metric | Formula | Interpretation |\n",
|
|
1902
|
+
"|--------|---------|----------------|\n",
|
|
1903
|
+
"| **Velocity** | Ī(value) / Īt | Rate of change - is activity speeding up or slowing down? |\n",
|
|
1904
|
+
"| **Acceleration** | Ī(velocity) / Īt | Change in rate - is the slowdown accelerating? |\n",
|
|
1905
|
+
"\n",
|
|
1906
|
+
"**š Analysis Approach:**\n",
|
|
1907
|
+
"\n",
|
|
1908
|
+
"1. **Signal Heatmap**: Effect sizes (Cohen's d) across variable Ć time window combinations\n",
|
|
1909
|
+
" - Shows cohort separation strength at each time scale\n",
|
|
1910
|
+
" - Higher |d| = stronger individual signal, but low |d| features may still help in combinations\n",
|
|
1911
|
+
" \n",
|
|
1912
|
+
"2. **Detailed Sparklines**: For top features (ranked by max |d| across windows)\n",
|
|
1913
|
+
" - Shows ALL time windows for each feature - different scales capture different dynamics\n",
|
|
1914
|
+
" - Retained vs churned velocity/acceleration side by side\n"
|
|
1915
|
+
]
|
|
1916
|
+
},
|
|
1917
|
+
{
|
|
1918
|
+
"cell_type": "code",
|
|
1919
|
+
"execution_count": null,
|
|
1920
|
+
"id": "d5614a91",
|
|
1921
|
+
"metadata": {
|
|
1922
|
+
"execution": {
|
|
1923
|
+
"iopub.execute_input": "2026-02-02T13:01:13.606846Z",
|
|
1924
|
+
"iopub.status.busy": "2026-02-02T13:01:13.606731Z",
|
|
1925
|
+
"iopub.status.idle": "2026-02-02T13:01:13.974618Z",
|
|
1926
|
+
"shell.execute_reply": "2026-02-02T13:01:13.974193Z"
|
|
1927
|
+
},
|
|
1928
|
+
"papermill": {
|
|
1929
|
+
"duration": 0.410128,
|
|
1930
|
+
"end_time": "2026-02-02T13:01:13.976884",
|
|
1931
|
+
"exception": false,
|
|
1932
|
+
"start_time": "2026-02-02T13:01:13.566756",
|
|
1933
|
+
"status": "completed"
|
|
1934
|
+
},
|
|
1935
|
+
"tags": []
|
|
1936
|
+
},
|
|
1937
|
+
"outputs": [],
|
|
1938
|
+
"source": [
|
|
1939
|
+
"# Velocity & Acceleration Cohort Analysis with Effect Size Heatmap\n",
|
|
1940
|
+
"if ENTITY_COLUMN and TARGET_COLUMN and sparkline_cols:\n",
|
|
1941
|
+
" continuous_cols = [c for c in sparkline_cols if df[c].nunique() > 2][:6]\n",
|
|
1942
|
+
" \n",
|
|
1943
|
+
" if not continuous_cols:\n",
|
|
1944
|
+
" print(\"ā ļø No continuous numeric columns found for velocity analysis.\")\n",
|
|
1945
|
+
" else:\n",
|
|
1946
|
+
" print(\"=\"*70)\n",
|
|
1947
|
+
" print(\"VELOCITY & ACCELERATION SIGNAL ANALYSIS\")\n",
|
|
1948
|
+
" print(\"=\"*70)\n",
|
|
1949
|
+
" \n",
|
|
1950
|
+
" if 'feature_analyzer' not in dir():\n",
|
|
1951
|
+
" feature_analyzer = TemporalFeatureAnalyzer(time_column=TIME_COLUMN, entity_column=ENTITY_COLUMN)\n",
|
|
1952
|
+
" \n",
|
|
1953
|
+
" windows = [7, 14, 30, 90, 180, 365]\n",
|
|
1954
|
+
" print(f\"Analyzing {len(continuous_cols)} features across windows: {windows} days\")\n",
|
|
1955
|
+
" \n",
|
|
1956
|
+
" all_results = {}\n",
|
|
1957
|
+
" heatmap_data = {\"velocity\": {}, \"acceleration\": {}}\n",
|
|
1958
|
+
" \n",
|
|
1959
|
+
" for col in continuous_cols:\n",
|
|
1960
|
+
" results = feature_analyzer.compute_cohort_velocity_signals(\n",
|
|
1961
|
+
" df, [col], TARGET_COLUMN, windows=windows\n",
|
|
1962
|
+
" )\n",
|
|
1963
|
+
" all_results[col] = results[col]\n",
|
|
1964
|
+
" heatmap_data[\"velocity\"][col] = {f\"{r.window_days}d\": r.velocity_effect_size for r in results[col]}\n",
|
|
1965
|
+
" heatmap_data[\"acceleration\"][col] = {f\"{r.window_days}d\": r.accel_effect_size for r in results[col]}\n",
|
|
1966
|
+
" \n",
|
|
1967
|
+
" fig = charts.velocity_signal_heatmap(heatmap_data, title=\"Cohort Separation: Velocity & Acceleration Effect Sizes (Cohen's d)\")\n",
|
|
1968
|
+
" display_figure(fig)\n",
|
|
1969
|
+
" \n",
|
|
1970
|
+
" print(\"\\n\" + \"=\"*70)\n",
|
|
1971
|
+
" print(\"DETAILED SPARKLINES (top features)\")\n",
|
|
1972
|
+
" print(\"=\"*70)\n",
|
|
1973
|
+
" \n",
|
|
1974
|
+
" feature_max_d = [(col, max(abs(r.velocity_effect_size) for r in results)) \n",
|
|
1975
|
+
" for col, results in all_results.items()]\n",
|
|
1976
|
+
" feature_max_d.sort(key=lambda x: -x[1])\n",
|
|
1977
|
+
" \n",
|
|
1978
|
+
" top_features = [col for col, _ in feature_max_d[:3]]\n",
|
|
1979
|
+
" for col in top_features:\n",
|
|
1980
|
+
" fig = charts.cohort_velocity_sparklines(all_results[col], feature_name=col)\n",
|
|
1981
|
+
" display_figure(fig)\n",
|
|
1982
|
+
" \n",
|
|
1983
|
+
" print(\"\\n\" + \"ā\"*70)\n",
|
|
1984
|
+
" print(\"š INTERPRETATION\")\n",
|
|
1985
|
+
" print(\"ā\"*70)\n",
|
|
1986
|
+
" print(\"\\nVelocity measures rate of change; acceleration measures change in rate.\")\n",
|
|
1987
|
+
" print(\"Positive d: retained > churned | Negative d: churned > retained\")\n",
|
|
1988
|
+
" print(\"|d| ā„ 0.8: large effect | ā„ 0.5: medium | ā„ 0.2: small\\n\")\n",
|
|
1989
|
+
" \n",
|
|
1990
|
+
" interpretation_notes = feature_analyzer.generate_velocity_interpretation(all_results)\n",
|
|
1991
|
+
" for note in interpretation_notes:\n",
|
|
1992
|
+
" print(note)\n",
|
|
1993
|
+
" \n",
|
|
1994
|
+
" print(\"\\n\" + \"ā\"*70)\n",
|
|
1995
|
+
" print(\"šÆ FEATURE RECOMMENDATIONS\")\n",
|
|
1996
|
+
" print(\"ā\"*70)\n",
|
|
1997
|
+
" \n",
|
|
1998
|
+
" velocity_recs = feature_analyzer.generate_velocity_recommendations(all_results)\n",
|
|
1999
|
+
" if velocity_recs:\n",
|
|
2000
|
+
" for rec in velocity_recs:\n",
|
|
2001
|
+
" priority_marker = \"š“\" if rec.priority == 1 else \"š”\"\n",
|
|
2002
|
+
" print(f\"\\n{priority_marker} {rec.action.upper()}\")\n",
|
|
2003
|
+
" print(f\" Column: {rec.source_column}\")\n",
|
|
2004
|
+
" print(f\" {rec.description}\")\n",
|
|
2005
|
+
" print(f\" Params: {rec.params}\")\n",
|
|
2006
|
+
" else:\n",
|
|
2007
|
+
" print(\"\\nNo velocity/acceleration features recommended (no strong signals found).\")\n",
|
|
2008
|
+
"\n",
|
|
2009
|
+
"# Store velocity recommendations for pattern_summary\n",
|
|
2010
|
+
"VELOCITY_RECOMMENDATIONS = [{\"action\": r.action, \"source_column\": r.source_column, \n",
|
|
2011
|
+
" \"description\": r.description, \"priority\": r.priority,\n",
|
|
2012
|
+
" \"effect_size\": r.effect_size, \"params\": r.params,\n",
|
|
2013
|
+
" \"features\": [f\"{r.source_column}_velocity_{r.params.get('window_days', 7)}d\"]}\n",
|
|
2014
|
+
" for r in velocity_recs] if velocity_recs else []\n"
|
|
2015
|
+
]
|
|
2016
|
+
},
|
|
2017
|
+
{
|
|
2018
|
+
"cell_type": "markdown",
|
|
2019
|
+
"id": "0c9b429a",
|
|
2020
|
+
"metadata": {
|
|
2021
|
+
"papermill": {
|
|
2022
|
+
"duration": 0.062233,
|
|
2023
|
+
"end_time": "2026-02-02T13:01:14.099423",
|
|
2024
|
+
"exception": false,
|
|
2025
|
+
"start_time": "2026-02-02T13:01:14.037190",
|
|
2026
|
+
"status": "completed"
|
|
2027
|
+
},
|
|
2028
|
+
"tags": []
|
|
2029
|
+
},
|
|
2030
|
+
"source": [
|
|
2031
|
+
"## 1c.13 Momentum Analysis (Window Ratios)\n",
|
|
2032
|
+
"\n",
|
|
2033
|
+
"**š What is Momentum?**\n",
|
|
2034
|
+
"\n",
|
|
2035
|
+
"Momentum compares recent activity to historical activity for each customer:\n",
|
|
2036
|
+
"\n",
|
|
2037
|
+
"```\n",
|
|
2038
|
+
"Momentum = mean(value over last N days) / mean(value over last M days)\n",
|
|
2039
|
+
"```\n",
|
|
2040
|
+
"\n",
|
|
2041
|
+
"Where N < M (e.g., 7d/30d compares last week to last month).\n",
|
|
2042
|
+
"\n",
|
|
2043
|
+
"| Momentum Value | Interpretation |\n",
|
|
2044
|
+
"|----------------|----------------|\n",
|
|
2045
|
+
"| > 1.0 | Recent activity higher than historical ā engagement increasing |\n",
|
|
2046
|
+
"| < 1.0 | Recent activity lower than historical ā potential churn signal |\n",
|
|
2047
|
+
"| ā 1.0 | Stable behavior |\n",
|
|
2048
|
+
"\n",
|
|
2049
|
+
"**Window Pairs Analyzed:**\n",
|
|
2050
|
+
"- **Natural pairs** (week/month/quarter): 7d/30d, 30d/90d, 7d/90d\n",
|
|
2051
|
+
"- **Recommended pairs** from `pattern_config` (based on 01a aggregation windows)\n",
|
|
2052
|
+
"- **Accumulation pair**: recent activity vs all-time behavior\n"
|
|
2053
|
+
]
|
|
2054
|
+
},
|
|
2055
|
+
{
|
|
2056
|
+
"cell_type": "code",
|
|
2057
|
+
"execution_count": null,
|
|
2058
|
+
"id": "e67f9f86",
|
|
2059
|
+
"metadata": {
|
|
2060
|
+
"execution": {
|
|
2061
|
+
"iopub.execute_input": "2026-02-02T13:01:14.221835Z",
|
|
2062
|
+
"iopub.status.busy": "2026-02-02T13:01:14.221717Z",
|
|
2063
|
+
"iopub.status.idle": "2026-02-02T13:01:14.612203Z",
|
|
2064
|
+
"shell.execute_reply": "2026-02-02T13:01:14.611709Z"
|
|
2065
|
+
},
|
|
2066
|
+
"papermill": {
|
|
2067
|
+
"duration": 0.453371,
|
|
2068
|
+
"end_time": "2026-02-02T13:01:14.613097",
|
|
2069
|
+
"exception": false,
|
|
2070
|
+
"start_time": "2026-02-02T13:01:14.159726",
|
|
2071
|
+
"status": "completed"
|
|
2072
|
+
},
|
|
2073
|
+
"tags": []
|
|
2074
|
+
},
|
|
2075
|
+
"outputs": [],
|
|
2076
|
+
"source": [
|
|
2077
|
+
"# Momentum Analysis - Cohort Comparison\n",
|
|
2078
|
+
"if ENTITY_COLUMN and TARGET_COLUMN and sparkline_cols:\n",
|
|
2079
|
+
" print(\"=\"*70)\n",
|
|
2080
|
+
" print(\"MOMENTUM ANALYSIS (Window Ratios)\")\n",
|
|
2081
|
+
" print(\"=\"*70)\n",
|
|
2082
|
+
" \n",
|
|
2083
|
+
" if 'feature_analyzer' not in dir():\n",
|
|
2084
|
+
" feature_analyzer = TemporalFeatureAnalyzer(time_column=TIME_COLUMN, entity_column=ENTITY_COLUMN)\n",
|
|
2085
|
+
" \n",
|
|
2086
|
+
" # Use sparkline_cols directly (includes all numeric features ranked by variance)\n",
|
|
2087
|
+
" momentum_cols = sparkline_cols[:6]\n",
|
|
2088
|
+
" \n",
|
|
2089
|
+
" # Build comprehensive window pairs from multiple sources:\n",
|
|
2090
|
+
" # 1. Standard natural pairs (week/month/quarter)\n",
|
|
2091
|
+
" natural_pairs = [(7, 30), (30, 90), (7, 90)]\n",
|
|
2092
|
+
" \n",
|
|
2093
|
+
" # 2. Recommended pairs from pattern_config (based on 01a aggregation windows)\n",
|
|
2094
|
+
" recommended_pairs = pattern_config.get_momentum_pairs()\n",
|
|
2095
|
+
" \n",
|
|
2096
|
+
" # 3. Accumulation pair: shortest window vs all-time\n",
|
|
2097
|
+
" max_days = (df[TIME_COLUMN].max() - df[TIME_COLUMN].min()).days\n",
|
|
2098
|
+
" all_windows = [w for pair in natural_pairs + recommended_pairs for w in pair]\n",
|
|
2099
|
+
" shortest_window = min(all_windows) if all_windows else 7\n",
|
|
2100
|
+
" accumulation_pair = (shortest_window, max_days)\n",
|
|
2101
|
+
" \n",
|
|
2102
|
+
" # Combine and deduplicate (preserve order: natural first, then recommended, then accumulation)\n",
|
|
2103
|
+
" seen = set()\n",
|
|
2104
|
+
" window_pairs = []\n",
|
|
2105
|
+
" for pair in natural_pairs + recommended_pairs + [accumulation_pair]:\n",
|
|
2106
|
+
" if pair not in seen:\n",
|
|
2107
|
+
" window_pairs.append(pair)\n",
|
|
2108
|
+
" seen.add(pair)\n",
|
|
2109
|
+
" \n",
|
|
2110
|
+
" print(f\"Analyzing {len(momentum_cols)} features across {len(window_pairs)} window pairs:\")\n",
|
|
2111
|
+
" print(f\" Natural pairs (week/month/quarter): {natural_pairs}\")\n",
|
|
2112
|
+
" print(f\" Recommended pairs (from 01a): {recommended_pairs}\")\n",
|
|
2113
|
+
" print(f\" Accumulation pair: {shortest_window}d vs all-time ({max_days}d)\")\n",
|
|
2114
|
+
" print(f\" Combined (deduplicated): {len(window_pairs)} pairs\")\n",
|
|
2115
|
+
" \n",
|
|
2116
|
+
" all_momentum_results = {}\n",
|
|
2117
|
+
" for col in momentum_cols:\n",
|
|
2118
|
+
" results = feature_analyzer.compute_cohort_momentum_signals(\n",
|
|
2119
|
+
" df, [col], TARGET_COLUMN, window_pairs=window_pairs\n",
|
|
2120
|
+
" )\n",
|
|
2121
|
+
" all_momentum_results[col] = results[col]\n",
|
|
2122
|
+
" \n",
|
|
2123
|
+
" print(\"\\nš Momentum by Cohort:\")\n",
|
|
2124
|
+
" print(f\"{'Feature':<18} {'Window':<12} {'Retained':>10} {'Churned':>10} {'Effect d':>10}\")\n",
|
|
2125
|
+
" print(\"-\" * 62)\n",
|
|
2126
|
+
" for col, col_results in all_momentum_results.items():\n",
|
|
2127
|
+
" for r in col_results:\n",
|
|
2128
|
+
" label = r.window_label if r.long_window < 1000 else f\"{r.short_window}d/all\"\n",
|
|
2129
|
+
" print(f\"{col[:17]:<18} {label:<12} {r.retained_momentum:>10.2f} {r.churned_momentum:>10.2f} {r.effect_size:>10.2f}\")\n",
|
|
2130
|
+
" \n",
|
|
2131
|
+
" # Bar chart for best window pair per feature - with window labels above bars\n",
|
|
2132
|
+
" best_pair_data = {}\n",
|
|
2133
|
+
" best_window_labels = {} # Track which window was best\n",
|
|
2134
|
+
" for col, col_results in all_momentum_results.items():\n",
|
|
2135
|
+
" best = max(col_results, key=lambda r: abs(r.effect_size))\n",
|
|
2136
|
+
" best_pair_data[col] = {\"retained\": best.retained_momentum, \"churned\": best.churned_momentum}\n",
|
|
2137
|
+
" best_window_labels[col] = best.window_label if best.long_window < 1000 else f\"{best.short_window}d/all\"\n",
|
|
2138
|
+
" \n",
|
|
2139
|
+
" if best_pair_data:\n",
|
|
2140
|
+
" import plotly.graph_objects as go\n",
|
|
2141
|
+
" columns = list(best_pair_data.keys())\n",
|
|
2142
|
+
" col_labels = [c[:15] for c in columns]\n",
|
|
2143
|
+
" \n",
|
|
2144
|
+
" # Find max y value for positioning labels above bars\n",
|
|
2145
|
+
" max_y = max(max(best_pair_data[c][\"retained\"], best_pair_data[c][\"churned\"]) for c in columns)\n",
|
|
2146
|
+
" \n",
|
|
2147
|
+
" fig = go.Figure()\n",
|
|
2148
|
+
" fig.add_trace(go.Bar(\n",
|
|
2149
|
+
" name=\"š¢ Retained\", x=col_labels,\n",
|
|
2150
|
+
" y=[best_pair_data[c][\"retained\"] for c in columns],\n",
|
|
2151
|
+
" marker_color=charts.colors[\"success\"],\n",
|
|
2152
|
+
" ))\n",
|
|
2153
|
+
" fig.add_trace(go.Bar(\n",
|
|
2154
|
+
" name=\"š“ Churned\", x=col_labels,\n",
|
|
2155
|
+
" y=[best_pair_data[c][\"churned\"] for c in columns],\n",
|
|
2156
|
+
" marker_color=charts.colors[\"danger\"],\n",
|
|
2157
|
+
" ))\n",
|
|
2158
|
+
" fig.add_hline(y=1.0, line_dash=\"dash\", line_color=\"gray\",\n",
|
|
2159
|
+
" annotation_text=\"baseline\", annotation_position=\"right\")\n",
|
|
2160
|
+
" \n",
|
|
2161
|
+
" # Add window labels as annotations above each bar group\n",
|
|
2162
|
+
" for i, col in enumerate(columns):\n",
|
|
2163
|
+
" window_lbl = best_window_labels[col]\n",
|
|
2164
|
+
" fig.add_annotation(\n",
|
|
2165
|
+
" x=i, y=max_y * 1.08,\n",
|
|
2166
|
+
" text=f\"<b>{window_lbl}</b>\",\n",
|
|
2167
|
+
" showarrow=False,\n",
|
|
2168
|
+
" font=dict(size=10, color=\"#555\"),\n",
|
|
2169
|
+
" xref=\"x\", yref=\"y\",\n",
|
|
2170
|
+
" )\n",
|
|
2171
|
+
" \n",
|
|
2172
|
+
" fig.update_layout(\n",
|
|
2173
|
+
" title=\"Momentum Comparison (Best Window per Feature)\",\n",
|
|
2174
|
+
" xaxis_title=\"Feature\",\n",
|
|
2175
|
+
" yaxis_title=\"Momentum Ratio\",\n",
|
|
2176
|
+
" barmode=\"group\",\n",
|
|
2177
|
+
" height=400,\n",
|
|
2178
|
+
" yaxis=dict(range=[0, max_y * 1.15]), # Extra headroom for labels\n",
|
|
2179
|
+
" )\n",
|
|
2180
|
+
" display_figure(fig)\n",
|
|
2181
|
+
" \n",
|
|
2182
|
+
" print(\"\\n\" + \"ā\"*70)\n",
|
|
2183
|
+
" print(\"š INTERPRETATION\")\n",
|
|
2184
|
+
" print(\"ā\"*70)\n",
|
|
2185
|
+
" print(\"\\nMomentum = recent_mean / historical_mean (per entity, then averaged)\")\n",
|
|
2186
|
+
" print(\"> 1.0 = accelerating | < 1.0 = decelerating | ā 1.0 = stable\")\n",
|
|
2187
|
+
" print(\"|d| measures how differently retained vs churned customers behave\\n\")\n",
|
|
2188
|
+
" \n",
|
|
2189
|
+
" interpretation_notes = feature_analyzer.generate_momentum_interpretation(all_momentum_results)\n",
|
|
2190
|
+
" for note in interpretation_notes:\n",
|
|
2191
|
+
" print(note)\n",
|
|
2192
|
+
" \n",
|
|
2193
|
+
" print(\"\\n\" + \"ā\"*70)\n",
|
|
2194
|
+
" print(\"šÆ FEATURE RECOMMENDATIONS\")\n",
|
|
2195
|
+
" print(\"ā\"*70)\n",
|
|
2196
|
+
" \n",
|
|
2197
|
+
" momentum_recs = feature_analyzer.generate_momentum_recommendations(all_momentum_results)\n",
|
|
2198
|
+
" if momentum_recs:\n",
|
|
2199
|
+
" for rec in momentum_recs:\n",
|
|
2200
|
+
" priority_marker = \"š“\" if rec.priority == 1 else \"š”\"\n",
|
|
2201
|
+
" print(f\"\\n{priority_marker} {rec.action.upper()}\")\n",
|
|
2202
|
+
" print(f\" Column: {rec.source_column}\")\n",
|
|
2203
|
+
" print(f\" {rec.description}\")\n",
|
|
2204
|
+
" print(f\" Params: {rec.params}\")\n",
|
|
2205
|
+
" else:\n",
|
|
2206
|
+
" print(\"\\nNo momentum features recommended (no strong cohort separation found).\")\n",
|
|
2207
|
+
"\n",
|
|
2208
|
+
"# Store momentum recommendations for pattern_summary\n",
|
|
2209
|
+
"MOMENTUM_RECOMMENDATIONS = [{\"action\": r.action, \"source_column\": r.source_column, \n",
|
|
2210
|
+
" \"description\": r.description, \"priority\": r.priority,\n",
|
|
2211
|
+
" \"effect_size\": r.effect_size, \"params\": r.params,\n",
|
|
2212
|
+
" \"features\": [f\"{r.source_column}_momentum_{r.params['short_window']}_{r.params['long_window']}\"]}\n",
|
|
2213
|
+
" for r in momentum_recs] if momentum_recs else []\n"
|
|
2214
|
+
]
|
|
2215
|
+
},
|
|
2216
|
+
{
|
|
2217
|
+
"cell_type": "markdown",
|
|
2218
|
+
"id": "4e0317dd",
|
|
2219
|
+
"metadata": {
|
|
2220
|
+
"papermill": {
|
|
2221
|
+
"duration": 0.059994,
|
|
2222
|
+
"end_time": "2026-02-02T13:01:14.768128",
|
|
2223
|
+
"exception": false,
|
|
2224
|
+
"start_time": "2026-02-02T13:01:14.708134",
|
|
2225
|
+
"status": "completed"
|
|
2226
|
+
},
|
|
2227
|
+
"tags": []
|
|
2228
|
+
},
|
|
2229
|
+
"source": [
|
|
2230
|
+
"## 1c.14 Lag Correlation Analysis\n",
|
|
2231
|
+
"\n",
|
|
2232
|
+
"**š Why Lag Correlations Matter:**\n",
|
|
2233
|
+
"\n",
|
|
2234
|
+
"Lag correlations show how a metric relates to itself over time:\n",
|
|
2235
|
+
"- High lag-1 correlation: Today's value predicts tomorrow's\n",
|
|
2236
|
+
"- Decaying correlations: Effect diminishes over time\n",
|
|
2237
|
+
"- Periodic spikes: Seasonality (e.g., spike at lag 7 = weekly pattern)"
|
|
2238
|
+
]
|
|
2239
|
+
},
|
|
2240
|
+
{
|
|
2241
|
+
"cell_type": "code",
|
|
2242
|
+
"execution_count": null,
|
|
2243
|
+
"id": "a47c073c",
|
|
2244
|
+
"metadata": {
|
|
2245
|
+
"execution": {
|
|
2246
|
+
"iopub.execute_input": "2026-02-02T13:01:14.892352Z",
|
|
2247
|
+
"iopub.status.busy": "2026-02-02T13:01:14.892235Z",
|
|
2248
|
+
"iopub.status.idle": "2026-02-02T13:01:14.987790Z",
|
|
2249
|
+
"shell.execute_reply": "2026-02-02T13:01:14.987365Z"
|
|
2250
|
+
},
|
|
2251
|
+
"papermill": {
|
|
2252
|
+
"duration": 0.157473,
|
|
2253
|
+
"end_time": "2026-02-02T13:01:14.988366",
|
|
2254
|
+
"exception": false,
|
|
2255
|
+
"start_time": "2026-02-02T13:01:14.830893",
|
|
2256
|
+
"status": "completed"
|
|
2257
|
+
},
|
|
2258
|
+
"tags": []
|
|
2259
|
+
},
|
|
2260
|
+
"outputs": [],
|
|
2261
|
+
"source": [
|
|
2262
|
+
"# Lag Correlation Analysis using TemporalFeatureAnalyzer\n",
|
|
2263
|
+
"if ENTITY_COLUMN and sparkline_cols:\n",
|
|
2264
|
+
" lag_cols = sparkline_cols[:6]\n",
|
|
2265
|
+
" max_lag = 14\n",
|
|
2266
|
+
" \n",
|
|
2267
|
+
" print(\"=\"*70)\n",
|
|
2268
|
+
" print(\"LAG CORRELATION ANALYSIS\")\n",
|
|
2269
|
+
" print(\"=\"*70)\n",
|
|
2270
|
+
" \n",
|
|
2271
|
+
" if 'feature_analyzer' not in dir():\n",
|
|
2272
|
+
" feature_analyzer = TemporalFeatureAnalyzer(time_column=TIME_COLUMN, entity_column=ENTITY_COLUMN)\n",
|
|
2273
|
+
" \n",
|
|
2274
|
+
" # Calculate lag correlations using framework\n",
|
|
2275
|
+
" lag_results = feature_analyzer.calculate_lag_correlations(df, lag_cols, max_lag=max_lag)\n",
|
|
2276
|
+
" \n",
|
|
2277
|
+
" # Build data for heatmap\n",
|
|
2278
|
+
" lag_corr_data = {col: result.correlations for col, result in lag_results.items()}\n",
|
|
2279
|
+
" \n",
|
|
2280
|
+
" # Use ChartBuilder for visualization\n",
|
|
2281
|
+
" fig = charts.lag_correlation_heatmap(\n",
|
|
2282
|
+
" lag_corr_data,\n",
|
|
2283
|
+
" max_lag=max_lag,\n",
|
|
2284
|
+
" title=\"Autocorrelation by Lag (days)\"\n",
|
|
2285
|
+
" )\n",
|
|
2286
|
+
" display_figure(fig)\n",
|
|
2287
|
+
" \n",
|
|
2288
|
+
" # Display framework results\n",
|
|
2289
|
+
" print(\"\\nš Best Lag per Variable:\")\n",
|
|
2290
|
+
" for col, result in lag_results.items():\n",
|
|
2291
|
+
" best_lag_info = f\"best lag={result.best_lag}d (r={result.best_correlation:.2f})\"\n",
|
|
2292
|
+
" weekly_info = \" [Weekly pattern]\" if result.has_weekly_pattern else \"\"\n",
|
|
2293
|
+
" print(f\" {col[:25]}: {best_lag_info}{weekly_info}\")\n",
|
|
2294
|
+
" \n",
|
|
2295
|
+
" # INTERPRETATION SECTION\n",
|
|
2296
|
+
" print(\"\\n\" + \"ā\"*70)\n",
|
|
2297
|
+
" print(\"š INTERPRETATION\")\n",
|
|
2298
|
+
" print(\"ā\"*70)\n",
|
|
2299
|
+
" print(\"\\nLag correlation shows how a variable relates to its PAST values:\")\n",
|
|
2300
|
+
" print(\" ⢠r > 0.5: Strong memory - today predicts tomorrow well\")\n",
|
|
2301
|
+
" print(\" ⢠r 0.3-0.5: Moderate predictability from past\")\n",
|
|
2302
|
+
" print(\" ⢠r < 0.3: Weak autocorrelation - lag features less useful\\n\")\n",
|
|
2303
|
+
" \n",
|
|
2304
|
+
" interpretation_notes = feature_analyzer.generate_lag_interpretation(lag_results)\n",
|
|
2305
|
+
" for note in interpretation_notes:\n",
|
|
2306
|
+
" print(note)\n",
|
|
2307
|
+
" \n",
|
|
2308
|
+
" # RECOMMENDATIONS SECTION\n",
|
|
2309
|
+
" print(\"\\n\" + \"ā\"*70)\n",
|
|
2310
|
+
" print(\"šÆ FEATURE RECOMMENDATIONS\")\n",
|
|
2311
|
+
" print(\"ā\"*70)\n",
|
|
2312
|
+
" \n",
|
|
2313
|
+
" lag_recs = feature_analyzer.generate_lag_recommendations(lag_results)\n",
|
|
2314
|
+
" if lag_recs:\n",
|
|
2315
|
+
" for rec in lag_recs:\n",
|
|
2316
|
+
" priority_marker = \"š“\" if rec.priority == 1 else \"š”\"\n",
|
|
2317
|
+
" print(f\"\\n{priority_marker} {rec.action.upper()}\")\n",
|
|
2318
|
+
" print(f\" Column: {rec.source_column}\")\n",
|
|
2319
|
+
" print(f\" {rec.description}\")\n",
|
|
2320
|
+
" print(f\" Params: {rec.params}\")\n",
|
|
2321
|
+
" else:\n",
|
|
2322
|
+
" print(\"\\nNo lag features recommended (no strong autocorrelation found).\")\n",
|
|
2323
|
+
"\n",
|
|
2324
|
+
"# Store lag recommendations for pattern_summary\n",
|
|
2325
|
+
"LAG_RECOMMENDATIONS = [{\"action\": r.action, \"source_column\": r.source_column, \n",
|
|
2326
|
+
" \"description\": r.description, \"priority\": r.priority,\n",
|
|
2327
|
+
" \"features\": [f\"{r.source_column}_lag_{r.params.get('lag_days', 7)}d\"],\n",
|
|
2328
|
+
" \"params\": r.params}\n",
|
|
2329
|
+
" for r in lag_recs] if lag_recs else []\n"
|
|
2330
|
+
]
|
|
2331
|
+
},
|
|
2332
|
+
{
|
|
2333
|
+
"cell_type": "markdown",
|
|
2334
|
+
"id": "c5ea7432",
|
|
2335
|
+
"metadata": {
|
|
2336
|
+
"papermill": {
|
|
2337
|
+
"duration": 0.061178,
|
|
2338
|
+
"end_time": "2026-02-02T13:01:15.113943",
|
|
2339
|
+
"exception": false,
|
|
2340
|
+
"start_time": "2026-02-02T13:01:15.052765",
|
|
2341
|
+
"status": "completed"
|
|
2342
|
+
},
|
|
2343
|
+
"tags": []
|
|
2344
|
+
},
|
|
2345
|
+
"source": [
|
|
2346
|
+
"## 1c.15 Predictive Power Analysis (IV & KS Statistics)\n",
|
|
2347
|
+
"\n",
|
|
2348
|
+
"**š Information Value (IV) and KS Statistics:**\n",
|
|
2349
|
+
"\n",
|
|
2350
|
+
"These metrics measure how well features predict the target at entity level:\n",
|
|
2351
|
+
"\n",
|
|
2352
|
+
"| Metric | What It Measures | Interpretation |\n",
|
|
2353
|
+
"|--------|------------------|----------------|\n",
|
|
2354
|
+
"| **IV** | Predictive strength across bins | <0.02 weak, 0.02-0.1 medium, 0.1-0.3 strong, >0.3 very strong |\n",
|
|
2355
|
+
"| **KS** | Maximum separation between distributions | Higher = better class separation |\n",
|
|
2356
|
+
"\n",
|
|
2357
|
+
"**How These Relate to Other Sections:**\n",
|
|
2358
|
+
"\n",
|
|
2359
|
+
"| Section | Metric | Relationship to IV/KS |\n",
|
|
2360
|
+
"|---------|--------|----------------------|\n",
|
|
2361
|
+
"| **1c.10** | Cohen's d | Should correlate - both measure cohort separation. d assumes normality, IV handles non-linear. |\n",
|
|
2362
|
+
"| **1c.12** | Velocity effect sizes | High velocity d ā feature changes differently by cohort ā may show high IV |\n",
|
|
2363
|
+
"| **1c.13** | Momentum effect sizes | High momentum d ā behavioral change patterns differ ā may show high IV |\n",
|
|
2364
|
+
"| **1c.16** | CramƩr's V | For categorical features (IV/KS is for numeric) |\n",
|
|
2365
|
+
"\n",
|
|
2366
|
+
"**Validation:** Features with high Cohen's d (1c.10) should generally show high IV here. Disagreements may indicate non-linear relationships (IV captures) or outlier effects (KS captures).\n"
|
|
2367
|
+
]
|
|
2368
|
+
},
|
|
2369
|
+
{
|
|
2370
|
+
"cell_type": "code",
|
|
2371
|
+
"execution_count": null,
|
|
2372
|
+
"id": "7377af73",
|
|
2373
|
+
"metadata": {
|
|
2374
|
+
"execution": {
|
|
2375
|
+
"iopub.execute_input": "2026-02-02T13:01:15.243351Z",
|
|
2376
|
+
"iopub.status.busy": "2026-02-02T13:01:15.243235Z",
|
|
2377
|
+
"iopub.status.idle": "2026-02-02T13:01:15.373581Z",
|
|
2378
|
+
"shell.execute_reply": "2026-02-02T13:01:15.372631Z"
|
|
2379
|
+
},
|
|
2380
|
+
"papermill": {
|
|
2381
|
+
"duration": 0.196582,
|
|
2382
|
+
"end_time": "2026-02-02T13:01:15.374301",
|
|
2383
|
+
"exception": false,
|
|
2384
|
+
"start_time": "2026-02-02T13:01:15.177719",
|
|
2385
|
+
"status": "completed"
|
|
2386
|
+
},
|
|
2387
|
+
"tags": []
|
|
2388
|
+
},
|
|
2389
|
+
"outputs": [],
|
|
2390
|
+
"source": [
|
|
2391
|
+
"# Predictive Power Analysis (IV & KS)\n",
|
|
2392
|
+
"if ENTITY_COLUMN and TARGET_COLUMN and sparkline_cols:\n",
|
|
2393
|
+
" print(\"=\"*70)\n",
|
|
2394
|
+
" print(\"PREDICTIVE POWER ANALYSIS (IV & KS Statistics)\")\n",
|
|
2395
|
+
" print(\"=\"*70)\n",
|
|
2396
|
+
" \n",
|
|
2397
|
+
" if 'feature_analyzer' not in dir():\n",
|
|
2398
|
+
" feature_analyzer = TemporalFeatureAnalyzer(time_column=TIME_COLUMN, entity_column=ENTITY_COLUMN)\n",
|
|
2399
|
+
" \n",
|
|
2400
|
+
" analysis_cols = [c for c in sparkline_cols[:8] if c in df.columns]\n",
|
|
2401
|
+
" \n",
|
|
2402
|
+
" # Method handles aggregation to entity level internally\n",
|
|
2403
|
+
" iv_results = feature_analyzer.calculate_predictive_power(df, analysis_cols, TARGET_COLUMN)\n",
|
|
2404
|
+
" \n",
|
|
2405
|
+
" n_retained = df.groupby(ENTITY_COLUMN)[TARGET_COLUMN].first().sum()\n",
|
|
2406
|
+
" n_churned = df[ENTITY_COLUMN].nunique() - n_retained\n",
|
|
2407
|
+
" print(f\"Analyzing {len(analysis_cols)} features\")\n",
|
|
2408
|
+
" print(f\"Entities: {df[ENTITY_COLUMN].nunique():,} (Retained: {n_retained:,}, Churned: {n_churned:,})\")\n",
|
|
2409
|
+
" \n",
|
|
2410
|
+
" # Build visualization data\n",
|
|
2411
|
+
" iv_data = {col: {\"iv\": r.information_value, \"ks\": r.ks_statistic} for col, r in iv_results.items()}\n",
|
|
2412
|
+
" \n",
|
|
2413
|
+
" # Create IV/KS comparison chart\n",
|
|
2414
|
+
" import plotly.graph_objects as go\n",
|
|
2415
|
+
" from plotly.subplots import make_subplots\n",
|
|
2416
|
+
" \n",
|
|
2417
|
+
" features = list(iv_data.keys())\n",
|
|
2418
|
+
" ivs = [iv_data[f][\"iv\"] for f in features]\n",
|
|
2419
|
+
" kss = [iv_data[f][\"ks\"] for f in features]\n",
|
|
2420
|
+
" \n",
|
|
2421
|
+
" # Sort by IV\n",
|
|
2422
|
+
" sorted_idx = sorted(range(len(ivs)), key=lambda i: ivs[i], reverse=True)\n",
|
|
2423
|
+
" features = [features[i] for i in sorted_idx]\n",
|
|
2424
|
+
" ivs = [ivs[i] for i in sorted_idx]\n",
|
|
2425
|
+
" kss = [kss[i] for i in sorted_idx]\n",
|
|
2426
|
+
" \n",
|
|
2427
|
+
" fig = make_subplots(rows=1, cols=2, subplot_titles=[\"Information Value (IV)\", \"KS Statistic\"])\n",
|
|
2428
|
+
" \n",
|
|
2429
|
+
" # IV bars with threshold colors\n",
|
|
2430
|
+
" iv_colors = [\"#27ae60\" if v >= 0.1 else \"#f39c12\" if v >= 0.02 else \"#95a5a6\" for v in ivs]\n",
|
|
2431
|
+
" fig.add_trace(go.Bar(y=features, x=ivs, orientation=\"h\", marker_color=iv_colors, \n",
|
|
2432
|
+
" showlegend=False, hovertemplate=\"%{y}: IV=%{x:.3f}<extra></extra>\"), row=1, col=1)\n",
|
|
2433
|
+
" fig.add_vline(x=0.1, line_dash=\"dash\", line_color=\"#27ae60\", annotation_text=\"Strong\", row=1, col=1)\n",
|
|
2434
|
+
" fig.add_vline(x=0.02, line_dash=\"dash\", line_color=\"#f39c12\", annotation_text=\"Medium\", row=1, col=1)\n",
|
|
2435
|
+
" \n",
|
|
2436
|
+
" # KS bars\n",
|
|
2437
|
+
" ks_colors = [\"#3498db\" if v >= 0.3 else \"#85c1e9\" for v in kss]\n",
|
|
2438
|
+
" fig.add_trace(go.Bar(y=features, x=kss, orientation=\"h\", marker_color=ks_colors,\n",
|
|
2439
|
+
" showlegend=False, hovertemplate=\"%{y}: KS=%{x:.3f}<extra></extra>\"), row=1, col=2)\n",
|
|
2440
|
+
" \n",
|
|
2441
|
+
" fig.update_layout(height=400, title=\"Predictive Power: IV & KS Statistics\")\n",
|
|
2442
|
+
" fig.update_xaxes(title_text=\"IV\", row=1, col=1)\n",
|
|
2443
|
+
" fig.update_xaxes(title_text=\"KS\", row=1, col=2)\n",
|
|
2444
|
+
" display_figure(fig)\n",
|
|
2445
|
+
" \n",
|
|
2446
|
+
" # Details table\n",
|
|
2447
|
+
" print(\"\\nš Predictive Power Details:\")\n",
|
|
2448
|
+
" print(f\"{'Feature':<25} {'IV':>8} {'IV Strength':<12} {'KS':>8} {'KS Strength':<12}\")\n",
|
|
2449
|
+
" print(\"-\" * 70)\n",
|
|
2450
|
+
" for col in features:\n",
|
|
2451
|
+
" r = iv_results[col]\n",
|
|
2452
|
+
" print(f\"{col[:24]:<25} {r.information_value:>8.3f} {r.iv_interpretation:<12} {r.ks_statistic:>8.3f} {r.ks_interpretation:<12}\")\n",
|
|
2453
|
+
" \n",
|
|
2454
|
+
" # INTERPRETATION with cross-references\n",
|
|
2455
|
+
" print(\"\\n\" + \"ā\"*70)\n",
|
|
2456
|
+
" print(\"š INTERPRETATION\")\n",
|
|
2457
|
+
" print(\"ā\"*70)\n",
|
|
2458
|
+
" \n",
|
|
2459
|
+
" strong_iv = [col for col, r in iv_results.items() if r.information_value >= 0.1]\n",
|
|
2460
|
+
" medium_iv = [col for col, r in iv_results.items() if 0.02 <= r.information_value < 0.1]\n",
|
|
2461
|
+
" weak_iv = [col for col, r in iv_results.items() if r.information_value < 0.02]\n",
|
|
2462
|
+
" \n",
|
|
2463
|
+
" if strong_iv:\n",
|
|
2464
|
+
" print(f\"\\nStrong predictors (IV >= 0.1): {', '.join(strong_iv)}\")\n",
|
|
2465
|
+
" print(\" ā High predictive value, prioritize in model\")\n",
|
|
2466
|
+
" if medium_iv:\n",
|
|
2467
|
+
" print(f\"\\nMedium predictors (IV 0.02-0.1): {', '.join(medium_iv)}\")\n",
|
|
2468
|
+
" print(\" ā Useful signal, include in model\")\n",
|
|
2469
|
+
" if weak_iv:\n",
|
|
2470
|
+
" print(f\"\\nWeak predictors (IV < 0.02): {', '.join(weak_iv)}\")\n",
|
|
2471
|
+
" print(\" ā Limited predictive value alone\")\n",
|
|
2472
|
+
" \n",
|
|
2473
|
+
" # Cross-reference with 1c.10 Cohen's d if available\n",
|
|
2474
|
+
" if 'effect_df' in dir() and len(effect_df) > 0:\n",
|
|
2475
|
+
" print(\"\\nš Cross-reference with 1c.10 (Cohen's d):\")\n",
|
|
2476
|
+
" common_cols = [c for c in features if any(c in ec for ec in effect_df['feature'].tolist())]\n",
|
|
2477
|
+
" if common_cols:\n",
|
|
2478
|
+
" for col in common_cols[:3]:\n",
|
|
2479
|
+
" matching = effect_df[effect_df['feature'].str.contains(col, na=False)]\n",
|
|
2480
|
+
" if len(matching) > 0:\n",
|
|
2481
|
+
" d_val = matching.iloc[0]['cohens_d']\n",
|
|
2482
|
+
" iv_val = iv_results.get(col, None)\n",
|
|
2483
|
+
" if iv_val:\n",
|
|
2484
|
+
" agreement = \"ā agree\" if (abs(d_val) >= 0.5 and iv_val.information_value >= 0.1) or \\\n",
|
|
2485
|
+
" (abs(d_val) < 0.2 and iv_val.information_value < 0.02) else \"~ partial\"\n",
|
|
2486
|
+
" print(f\" {col}: d={d_val:+.2f}, IV={iv_val.information_value:.3f} {agreement}\")\n",
|
|
2487
|
+
" else:\n",
|
|
2488
|
+
" print(\" (No overlapping features to compare)\")\n",
|
|
2489
|
+
" \n",
|
|
2490
|
+
" # RECOMMENDATIONS\n",
|
|
2491
|
+
" print(\"\\n\" + \"ā\"*70)\n",
|
|
2492
|
+
" print(\"šÆ FEATURE RECOMMENDATIONS\")\n",
|
|
2493
|
+
" print(\"ā\"*70)\n",
|
|
2494
|
+
" \n",
|
|
2495
|
+
" if strong_iv:\n",
|
|
2496
|
+
" print(\"\\nš“ INCLUDE (Strong IV)\")\n",
|
|
2497
|
+
" for col in strong_iv:\n",
|
|
2498
|
+
" r = iv_results[col]\n",
|
|
2499
|
+
" print(f\" ⢠{col}: IV={r.information_value:.3f}, KS={r.ks_statistic:.3f}\")\n",
|
|
2500
|
+
" \n",
|
|
2501
|
+
" if medium_iv:\n",
|
|
2502
|
+
" print(\"\\nš” INCLUDE (Medium IV)\")\n",
|
|
2503
|
+
" for col in medium_iv[:5]:\n",
|
|
2504
|
+
" r = iv_results[col]\n",
|
|
2505
|
+
" print(f\" ⢠{col}: IV={r.information_value:.3f}\")\n",
|
|
2506
|
+
" \n",
|
|
2507
|
+
" if not strong_iv and not medium_iv:\n",
|
|
2508
|
+
" print(\"\\nNo features with strong predictive power found.\")\n",
|
|
2509
|
+
" print(\" ā See 1c.12 (velocity) and 1c.13 (momentum) for derived features\")\n",
|
|
2510
|
+
" print(\" ā See 1c.16 for categorical feature analysis\")\n",
|
|
2511
|
+
"else:\n",
|
|
2512
|
+
" print(\"Skipped: Requires entity column, target column, and numeric features\")\n",
|
|
2513
|
+
"\n",
|
|
2514
|
+
"# Store predictive power recommendations for pattern_summary\n",
|
|
2515
|
+
"PREDICTIVE_POWER_RECOMMENDATIONS = []\n",
|
|
2516
|
+
"if 'iv_results' in dir():\n",
|
|
2517
|
+
" for col, r in iv_results.items():\n",
|
|
2518
|
+
" if r.information_value >= 0.1:\n",
|
|
2519
|
+
" PREDICTIVE_POWER_RECOMMENDATIONS.append({\n",
|
|
2520
|
+
" \"action\": \"include_feature\", \"feature\": col,\n",
|
|
2521
|
+
" \"iv\": r.information_value, \"ks\": r.ks_statistic, \"priority\": \"high\",\n",
|
|
2522
|
+
" \"reason\": f\"Strong IV={r.information_value:.3f}, KS={r.ks_statistic:.3f}\",\n",
|
|
2523
|
+
" \"features\": [col]\n",
|
|
2524
|
+
" })\n",
|
|
2525
|
+
" elif r.information_value >= 0.02:\n",
|
|
2526
|
+
" PREDICTIVE_POWER_RECOMMENDATIONS.append({\n",
|
|
2527
|
+
" \"action\": \"include_feature\", \"feature\": col,\n",
|
|
2528
|
+
" \"iv\": r.information_value, \"ks\": r.ks_statistic, \"priority\": \"medium\",\n",
|
|
2529
|
+
" \"reason\": f\"Medium IV={r.information_value:.3f}\",\n",
|
|
2530
|
+
" \"features\": [col]\n",
|
|
2531
|
+
" })\n"
|
|
2532
|
+
]
|
|
2533
|
+
},
|
|
2534
|
+
{
|
|
2535
|
+
"cell_type": "markdown",
|
|
2536
|
+
"id": "8c8e8363",
|
|
2537
|
+
"metadata": {
|
|
2538
|
+
"papermill": {
|
|
2539
|
+
"duration": 0.065203,
|
|
2540
|
+
"end_time": "2026-02-02T13:01:15.505779",
|
|
2541
|
+
"exception": false,
|
|
2542
|
+
"start_time": "2026-02-02T13:01:15.440576",
|
|
2543
|
+
"status": "completed"
|
|
2544
|
+
},
|
|
2545
|
+
"tags": []
|
|
2546
|
+
},
|
|
2547
|
+
"source": [
|
|
2548
|
+
"## 1c.16 Categorical Feature Analysis\n",
|
|
2549
|
+
"\n",
|
|
2550
|
+
"**š What This Measures:**\n",
|
|
2551
|
+
"\n",
|
|
2552
|
+
"For each categorical feature, we analyze how its categories relate to the target (retention/churn):\n",
|
|
2553
|
+
"\n",
|
|
2554
|
+
"| Metric | What It Measures | How to Read |\n",
|
|
2555
|
+
"|--------|-----------------|-------------|\n",
|
|
2556
|
+
"| **CramƩr's V** | Overall association strength (0-1) | Higher = categories strongly predict target |\n",
|
|
2557
|
+
"| **High-Risk Categories** | Categories with target rate < 90% of average | These segments churn more |\n",
|
|
2558
|
+
"| **Low-Risk Categories** | Categories with target rate > 110% of average | These segments retain better |\n",
|
|
2559
|
+
"\n",
|
|
2560
|
+
"**Panel Guide:**\n",
|
|
2561
|
+
"\n",
|
|
2562
|
+
"| Panel | What It Shows | Color Scheme |\n",
|
|
2563
|
+
"|-------|---------------|--------------|\n",
|
|
2564
|
+
"| **Top-Left** | Feature ranking by CramĆ©r's V | š“ Strong ā„0.3 / š Moderate ā„0.1 / šµ Weak |\n",
|
|
2565
|
+
"| **Top-Right** | Count of features per effect bucket | š£ Purple gradient (darker = more significant bucket) |\n",
|
|
2566
|
+
"| **Bottom-Left** | High/low risk category counts | š“ High-risk (churn) / š¢ Low-risk (retain) |\n",
|
|
2567
|
+
"| **Bottom-Right** | Category breakdown (top feature) | š“ Below avg / š¢ Above avg / šµ Near avg |\n",
|
|
2568
|
+
"\n",
|
|
2569
|
+
"**Effect Strength Thresholds:**\n",
|
|
2570
|
+
"| CramƩr's V | Strength | Action |\n",
|
|
2571
|
+
"|------------|----------|--------|\n",
|
|
2572
|
+
"| ā„ 0.3 | Strong | Priority feature - include and consider interactions |\n",
|
|
2573
|
+
"| 0.15ā0.3 | Moderate | Include in model |\n",
|
|
2574
|
+
"| 0.05ā0.15 | Weak | May add noise, test impact |\n",
|
|
2575
|
+
"| < 0.05 | Negligible | Consider dropping |\n"
|
|
2576
|
+
]
|
|
2577
|
+
},
|
|
2578
|
+
{
|
|
2579
|
+
"cell_type": "code",
|
|
2580
|
+
"execution_count": null,
|
|
2581
|
+
"id": "89ba3170",
|
|
2582
|
+
"metadata": {
|
|
2583
|
+
"execution": {
|
|
2584
|
+
"iopub.execute_input": "2026-02-02T13:01:15.637541Z",
|
|
2585
|
+
"iopub.status.busy": "2026-02-02T13:01:15.637431Z",
|
|
2586
|
+
"iopub.status.idle": "2026-02-02T13:01:17.189978Z",
|
|
2587
|
+
"shell.execute_reply": "2026-02-02T13:01:17.189424Z"
|
|
2588
|
+
},
|
|
2589
|
+
"papermill": {
|
|
2590
|
+
"duration": 1.619747,
|
|
2591
|
+
"end_time": "2026-02-02T13:01:17.191361",
|
|
2592
|
+
"exception": false,
|
|
2593
|
+
"start_time": "2026-02-02T13:01:15.571614",
|
|
2594
|
+
"status": "completed"
|
|
2595
|
+
},
|
|
2596
|
+
"tags": []
|
|
2597
|
+
},
|
|
2598
|
+
"outputs": [],
|
|
2599
|
+
"source": [
|
|
2600
|
+
"# Categorical Feature Analysis\n",
|
|
2601
|
+
"from customer_retention.stages.profiling import analyze_categorical_features\n",
|
|
2602
|
+
"\n",
|
|
2603
|
+
"if ENTITY_COLUMN and TARGET_COLUMN:\n",
|
|
2604
|
+
" print(\"=\"*70)\n",
|
|
2605
|
+
" print(\"CATEGORICAL FEATURE ANALYSIS\")\n",
|
|
2606
|
+
" print(\"=\"*70)\n",
|
|
2607
|
+
" \n",
|
|
2608
|
+
" # Aggregate to entity level (take mode for categorical columns)\n",
|
|
2609
|
+
" cat_cols = [c for c in df.select_dtypes(include=['object', 'category']).columns \n",
|
|
2610
|
+
" if c not in [ENTITY_COLUMN, TIME_COLUMN, TARGET_COLUMN]]\n",
|
|
2611
|
+
" \n",
|
|
2612
|
+
" if cat_cols:\n",
|
|
2613
|
+
" entity_cats_df = df.groupby(ENTITY_COLUMN).agg(\n",
|
|
2614
|
+
" {c: lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else None for c in cat_cols}\n",
|
|
2615
|
+
" ).reset_index()\n",
|
|
2616
|
+
" entity_target = df.groupby(ENTITY_COLUMN)[TARGET_COLUMN].first().reset_index()\n",
|
|
2617
|
+
" entity_data = entity_cats_df.merge(entity_target, on=ENTITY_COLUMN)\n",
|
|
2618
|
+
" \n",
|
|
2619
|
+
" cat_result = analyze_categorical_features(entity_data, ENTITY_COLUMN, TARGET_COLUMN)\n",
|
|
2620
|
+
" \n",
|
|
2621
|
+
" print(f\"Features analyzed: {len(cat_result.feature_insights)}\")\n",
|
|
2622
|
+
" print(f\"Features filtered: {len(cat_result.filtered_columns)}\")\n",
|
|
2623
|
+
" print(f\"Overall target rate: {cat_result.overall_target_rate:.1%}\")\n",
|
|
2624
|
+
" \n",
|
|
2625
|
+
" if cat_result.feature_insights:\n",
|
|
2626
|
+
" # VISUALS\n",
|
|
2627
|
+
" fig = charts.categorical_analysis_panel(cat_result.feature_insights, cat_result.overall_target_rate)\n",
|
|
2628
|
+
" display_figure(fig)\n",
|
|
2629
|
+
" \n",
|
|
2630
|
+
" # DETAILS TABLE\n",
|
|
2631
|
+
" print(\"\\nš Feature Details:\")\n",
|
|
2632
|
+
" print(f\"{'Feature':<20} {'CramƩr V':>10} {'Effect':>12} {'#Cats':>7} {'High Risk':>10} {'Low Risk':>10}\")\n",
|
|
2633
|
+
" print(\"-\" * 75)\n",
|
|
2634
|
+
" for insight in cat_result.feature_insights:\n",
|
|
2635
|
+
" print(f\"{insight.feature_name[:19]:<20} {insight.cramers_v:>10.3f} {insight.effect_strength:>12} \"\n",
|
|
2636
|
+
" f\"{insight.n_categories:>7} {len(insight.high_risk_categories):>10} {len(insight.low_risk_categories):>10}\")\n",
|
|
2637
|
+
" \n",
|
|
2638
|
+
" # INTERPRETATION\n",
|
|
2639
|
+
" print(\"\\n\" + \"ā\"*70)\n",
|
|
2640
|
+
" print(\"š INTERPRETATION\")\n",
|
|
2641
|
+
" print(\"ā\"*70)\n",
|
|
2642
|
+
" \n",
|
|
2643
|
+
" strong = [i for i in cat_result.feature_insights if i.effect_strength == \"strong\"]\n",
|
|
2644
|
+
" moderate = [i for i in cat_result.feature_insights if i.effect_strength == \"moderate\"]\n",
|
|
2645
|
+
" weak = [i for i in cat_result.feature_insights if i.effect_strength in (\"weak\", \"negligible\")]\n",
|
|
2646
|
+
" \n",
|
|
2647
|
+
" if strong:\n",
|
|
2648
|
+
" print(f\"\\nStrong predictors ({len(strong)}): {', '.join(i.feature_name for i in strong)}\")\n",
|
|
2649
|
+
" print(\" ā These features have clear category-target relationships\")\n",
|
|
2650
|
+
" print(\" ā Include in model, consider one-hot encoding\")\n",
|
|
2651
|
+
" \n",
|
|
2652
|
+
" if moderate:\n",
|
|
2653
|
+
" print(f\"\\nModerate predictors ({len(moderate)}): {', '.join(i.feature_name for i in moderate)}\")\n",
|
|
2654
|
+
" print(\" ā Some predictive power, include if cardinality is reasonable\")\n",
|
|
2655
|
+
" \n",
|
|
2656
|
+
" if weak:\n",
|
|
2657
|
+
" print(f\"\\nWeak/negligible ({len(weak)}): {', '.join(i.feature_name for i in weak)}\")\n",
|
|
2658
|
+
" print(\" ā Limited predictive value, may add noise\")\n",
|
|
2659
|
+
" \n",
|
|
2660
|
+
" # High-risk category insights\n",
|
|
2661
|
+
" all_high_risk = [(i.feature_name, c) for i in cat_result.feature_insights for c in i.high_risk_categories[:2]]\n",
|
|
2662
|
+
" if all_high_risk:\n",
|
|
2663
|
+
" print(\"\\nHigh-risk segments (below-average retention):\")\n",
|
|
2664
|
+
" for feat, cat in all_high_risk[:5]:\n",
|
|
2665
|
+
" print(f\" ⢠{feat} = '{cat}'\")\n",
|
|
2666
|
+
" \n",
|
|
2667
|
+
" # RECOMMENDATIONS\n",
|
|
2668
|
+
" print(\"\\n\" + \"ā\"*70)\n",
|
|
2669
|
+
" print(\"šÆ FEATURE RECOMMENDATIONS\")\n",
|
|
2670
|
+
" print(\"ā\"*70)\n",
|
|
2671
|
+
" \n",
|
|
2672
|
+
" if cat_result.recommendations:\n",
|
|
2673
|
+
" for rec in cat_result.recommendations:\n",
|
|
2674
|
+
" priority_marker = \"š“\" if rec.get('priority') == 'high' else \"š”\"\n",
|
|
2675
|
+
" print(f\"\\n{priority_marker} {rec.get('action', 'RECOMMENDATION').upper()}\")\n",
|
|
2676
|
+
" print(f\" {rec.get('reason', '')}\")\n",
|
|
2677
|
+
" else:\n",
|
|
2678
|
+
" # Generate recommendations based on analysis\n",
|
|
2679
|
+
" if strong:\n",
|
|
2680
|
+
" print(\"\\nš“ INCLUDE STRONG PREDICTORS\")\n",
|
|
2681
|
+
" for i in strong:\n",
|
|
2682
|
+
" print(f\" ⢠{i.feature_name}: V={i.cramers_v:.3f}, {i.n_categories} categories\")\n",
|
|
2683
|
+
" \n",
|
|
2684
|
+
" if any(i.n_categories > 20 for i in cat_result.feature_insights):\n",
|
|
2685
|
+
" high_card = [i for i in cat_result.feature_insights if i.n_categories > 20]\n",
|
|
2686
|
+
" print(\"\\nš” HIGH CARDINALITY - CONSIDER GROUPING\")\n",
|
|
2687
|
+
" for i in high_card:\n",
|
|
2688
|
+
" print(f\" ⢠{i.feature_name}: {i.n_categories} categories ā group rare categories\")\n",
|
|
2689
|
+
" \n",
|
|
2690
|
+
" if not strong and not moderate:\n",
|
|
2691
|
+
" print(\"\\nNo strong categorical predictors found.\")\n",
|
|
2692
|
+
" print(\" ⢠Consider creating derived features (e.g., category combinations)\")\n",
|
|
2693
|
+
" print(\" ⢠Or focus on numeric/temporal features\")\n",
|
|
2694
|
+
" else:\n",
|
|
2695
|
+
" print(\"\\nNo categorical features passed filtering criteria.\")\n",
|
|
2696
|
+
" if cat_result.filtered_columns:\n",
|
|
2697
|
+
" print(\"Filtered out:\")\n",
|
|
2698
|
+
" for col in cat_result.filtered_columns[:5]:\n",
|
|
2699
|
+
" reason = cat_result.filter_reasons.get(col, \"unknown\")\n",
|
|
2700
|
+
" print(f\" ⢠{col}: {reason}\")\n",
|
|
2701
|
+
" else:\n",
|
|
2702
|
+
" print(\"No categorical columns found in dataset.\")\n",
|
|
2703
|
+
"else:\n",
|
|
2704
|
+
" print(\"Skipped: Requires both entity and target columns\")\n"
|
|
2705
|
+
]
|
|
2706
|
+
},
|
|
2707
|
+
{
|
|
2708
|
+
"cell_type": "markdown",
|
|
2709
|
+
"id": "9c426663",
|
|
2710
|
+
"metadata": {
|
|
2711
|
+
"papermill": {
|
|
2712
|
+
"duration": 0.066164,
|
|
2713
|
+
"end_time": "2026-02-02T13:01:17.361718",
|
|
2714
|
+
"exception": false,
|
|
2715
|
+
"start_time": "2026-02-02T13:01:17.295554",
|
|
2716
|
+
"status": "completed"
|
|
2717
|
+
},
|
|
2718
|
+
"tags": []
|
|
2719
|
+
},
|
|
2720
|
+
"source": [
|
|
2721
|
+
"## 1c.17 Feature Engineering Summary\n",
|
|
2722
|
+
"\n",
|
|
2723
|
+
"**š Feature Types with Configured Windows:**\n",
|
|
2724
|
+
"\n",
|
|
2725
|
+
"The table below shows feature formulas using windows derived from 01a findings.\n",
|
|
2726
|
+
"Run the next cell to see actual values for your data.\n"
|
|
2727
|
+
]
|
|
2728
|
+
},
|
|
2729
|
+
{
|
|
2730
|
+
"cell_type": "code",
|
|
2731
|
+
"execution_count": null,
|
|
2732
|
+
"id": "30ee4efd",
|
|
2733
|
+
"metadata": {
|
|
2734
|
+
"execution": {
|
|
2735
|
+
"iopub.execute_input": "2026-02-02T13:01:17.498129Z",
|
|
2736
|
+
"iopub.status.busy": "2026-02-02T13:01:17.497998Z",
|
|
2737
|
+
"iopub.status.idle": "2026-02-02T13:02:46.116785Z",
|
|
2738
|
+
"shell.execute_reply": "2026-02-02T13:02:46.116423Z"
|
|
2739
|
+
},
|
|
2740
|
+
"papermill": {
|
|
2741
|
+
"duration": 88.753341,
|
|
2742
|
+
"end_time": "2026-02-02T13:02:46.183612",
|
|
2743
|
+
"exception": false,
|
|
2744
|
+
"start_time": "2026-02-02T13:01:17.430271",
|
|
2745
|
+
"status": "completed"
|
|
2746
|
+
},
|
|
2747
|
+
"tags": []
|
|
2748
|
+
},
|
|
2749
|
+
"outputs": [],
|
|
2750
|
+
"source": [
|
|
2751
|
+
"# Feature Engineering Recommendations\n",
|
|
2752
|
+
"print(\"=\"*80)\n",
|
|
2753
|
+
"print(\"FEATURE ENGINEERING RECOMMENDATIONS\")\n",
|
|
2754
|
+
"print(\"=\"*80)\n",
|
|
2755
|
+
"\n",
|
|
2756
|
+
"# Display configured windows from pattern_config\n",
|
|
2757
|
+
"momentum_pairs = pattern_config.get_momentum_pairs()\n",
|
|
2758
|
+
"short_w = momentum_pairs[0][0] if momentum_pairs else 7\n",
|
|
2759
|
+
"long_w = momentum_pairs[0][1] if momentum_pairs else 30\n",
|
|
2760
|
+
"\n",
|
|
2761
|
+
"print(f\"\"\"\n",
|
|
2762
|
+
"āāāāāāāāāāāāāāāāāāā¬āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā\n",
|
|
2763
|
+
"ā Feature Type ā Formula (using configured windows) ā\n",
|
|
2764
|
+
"āāāāāāāāāāāāāāāāāāā¼āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā¤\n",
|
|
2765
|
+
"ā Velocity ā (value_now - value_{short_w}d_ago) / {short_w} ā\n",
|
|
2766
|
+
"ā Acceleration ā velocity_now - velocity_{short_w}d_ago ā\n",
|
|
2767
|
+
"ā Momentum ā mean_{short_w}d / mean_{long_w}d ā\n",
|
|
2768
|
+
"ā Lag ā df[col].shift(N) ā\n",
|
|
2769
|
+
"ā Rolling Mean ā df[col].rolling({short_w}).mean() ā\n",
|
|
2770
|
+
"ā Rolling Std ā df[col].rolling({long_w}).std() ā\n",
|
|
2771
|
+
"ā Ratio ā sum_{long_w}d / sum_all_time ā\n",
|
|
2772
|
+
"āāāāāāāāāāāāāāāāāāā“āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā\n",
|
|
2773
|
+
"\n",
|
|
2774
|
+
"Windows derived from 01a findings: {pattern_config.aggregation_windows}\n",
|
|
2775
|
+
"Velocity window: {pattern_config.velocity_window_days}d\n",
|
|
2776
|
+
"Momentum pairs: {momentum_pairs}\n",
|
|
2777
|
+
"\"\"\")\n",
|
|
2778
|
+
"\n",
|
|
2779
|
+
"# Framework recommendations (without target - event-level data)\n",
|
|
2780
|
+
"if 'feature_analyzer' in dir() and sparkline_cols:\n",
|
|
2781
|
+
" recommendations = feature_analyzer.get_feature_recommendations(\n",
|
|
2782
|
+
" df, value_columns=sparkline_cols, target_column=None\n",
|
|
2783
|
+
" )\n",
|
|
2784
|
+
" if recommendations:\n",
|
|
2785
|
+
" print(\"šÆ Framework Recommendations (based on temporal patterns):\")\n",
|
|
2786
|
+
" for rec in recommendations[:5]:\n",
|
|
2787
|
+
" print(f\" ⢠{rec.feature_type.value}: {rec.source_column} ā {rec.feature_name}\")\n",
|
|
2788
|
+
" print(f\" Formula: {rec.formula}\")\n",
|
|
2789
|
+
" print(f\" Rationale: {rec.rationale}\")\n",
|
|
2790
|
+
" print(\"\"\"\n",
|
|
2791
|
+
"š” Note: Target-based recommendations require entity-level data.\n",
|
|
2792
|
+
" Run notebook 01d first to aggregate, then 02 for target analysis.\n",
|
|
2793
|
+
"\"\"\")\n"
|
|
2794
|
+
]
|
|
2795
|
+
},
|
|
2796
|
+
{
|
|
2797
|
+
"cell_type": "code",
|
|
2798
|
+
"execution_count": null,
|
|
2799
|
+
"id": "f2d68de1",
|
|
2800
|
+
"metadata": {
|
|
2801
|
+
"execution": {
|
|
2802
|
+
"iopub.execute_input": "2026-02-02T13:02:46.317914Z",
|
|
2803
|
+
"iopub.status.busy": "2026-02-02T13:02:46.317801Z",
|
|
2804
|
+
"iopub.status.idle": "2026-02-02T13:02:46.322153Z",
|
|
2805
|
+
"shell.execute_reply": "2026-02-02T13:02:46.321771Z"
|
|
2806
|
+
},
|
|
2807
|
+
"papermill": {
|
|
2808
|
+
"duration": 0.071965,
|
|
2809
|
+
"end_time": "2026-02-02T13:02:46.322701",
|
|
2810
|
+
"exception": false,
|
|
2811
|
+
"start_time": "2026-02-02T13:02:46.250736",
|
|
2812
|
+
"status": "completed"
|
|
2813
|
+
},
|
|
2814
|
+
"tags": []
|
|
2815
|
+
},
|
|
2816
|
+
"outputs": [],
|
|
2817
|
+
"source": [
|
|
2818
|
+
"print(\"\\n\" + \"=\"*70)\n",
|
|
2819
|
+
"print(\"TEMPORAL PATTERN SUMMARY\")\n",
|
|
2820
|
+
"print(\"=\"*70)\n",
|
|
2821
|
+
"\n",
|
|
2822
|
+
"# Windows used\n",
|
|
2823
|
+
"print(f\"\\nāļø CONFIGURED WINDOWS: {pattern_config.aggregation_windows}\")\n",
|
|
2824
|
+
"print(f\" Velocity: {pattern_config.velocity_window_days}d | Momentum: {pattern_config.get_momentum_pairs()}\")\n",
|
|
2825
|
+
"\n",
|
|
2826
|
+
"# Trend summary\n",
|
|
2827
|
+
"print(f\"\\nš TREND:\")\n",
|
|
2828
|
+
"print(f\" Direction: {trend_result.direction.value}\")\n",
|
|
2829
|
+
"print(f\" Confidence: {trend_result.confidence}\")\n",
|
|
2830
|
+
"\n",
|
|
2831
|
+
"# Seasonality summary\n",
|
|
2832
|
+
"print(f\"\\nš SEASONALITY:\")\n",
|
|
2833
|
+
"if seasonality_results:\n",
|
|
2834
|
+
" for sr in seasonality_results[:2]:\n",
|
|
2835
|
+
" period_name = sr.period_name or f\"{sr.period}-day\"\n",
|
|
2836
|
+
" print(f\" {period_name.title()} pattern (strength: {sr.strength:.2f})\")\n",
|
|
2837
|
+
"else:\n",
|
|
2838
|
+
" print(\" No significant seasonality detected\")\n",
|
|
2839
|
+
"\n",
|
|
2840
|
+
"# Recency summary\n",
|
|
2841
|
+
"if ENTITY_COLUMN:\n",
|
|
2842
|
+
" print(f\"\\nā±ļø RECENCY:\")\n",
|
|
2843
|
+
" print(f\" Median: {recency_result.median_recency_days:.0f} days\")\n",
|
|
2844
|
+
" if recency_result.target_correlation:\n",
|
|
2845
|
+
" corr = recency_result.target_correlation\n",
|
|
2846
|
+
" print(f\" Target correlation: {corr:.3f} {'(strong signal)' if abs(corr) > 0.3 else ''}\")\n",
|
|
2847
|
+
"\n",
|
|
2848
|
+
"# Velocity summary (if computed)\n",
|
|
2849
|
+
"if 'velocity_summary' in dir() and velocity_summary:\n",
|
|
2850
|
+
" print(f\"\\nš VELOCITY ({pattern_config.velocity_window_days}d window):\")\n",
|
|
2851
|
+
" divergent = [col for col, v in velocity_summary.items() if v.get('divergent')]\n",
|
|
2852
|
+
" if divergent:\n",
|
|
2853
|
+
" print(f\" Divergent columns (retained vs churned): {divergent}\")\n",
|
|
2854
|
+
" else:\n",
|
|
2855
|
+
" print(\" No significant divergence between cohorts\")\n",
|
|
2856
|
+
"\n",
|
|
2857
|
+
"# Momentum summary (if computed)\n",
|
|
2858
|
+
"if 'momentum_data' in dir() and momentum_data:\n",
|
|
2859
|
+
" print(f\"\\nš MOMENTUM ({pattern_config.get_momentum_pairs()[0] if pattern_config.get_momentum_pairs() else 'N/A'}):\")\n",
|
|
2860
|
+
" if 'divergent_cols' in dir() and divergent_cols:\n",
|
|
2861
|
+
" # Filter out target to prevent misleading metadata\n",
|
|
2862
|
+
" filtered_divergent = [c for c in divergent_cols if c.lower() != TARGET_COLUMN.lower()] if TARGET_COLUMN else divergent_cols\n",
|
|
2863
|
+
" if filtered_divergent:\n",
|
|
2864
|
+
" print(f\" High-signal columns: {filtered_divergent}\")\n",
|
|
2865
|
+
" else:\n",
|
|
2866
|
+
" print(\" No significant momentum differences detected (target excluded)\")\n",
|
|
2867
|
+
" else:\n",
|
|
2868
|
+
" print(\" No significant momentum differences detected\")\n"
|
|
2869
|
+
]
|
|
2870
|
+
},
|
|
2871
|
+
{
|
|
2872
|
+
"cell_type": "code",
|
|
2873
|
+
"execution_count": null,
|
|
2874
|
+
"id": "09773d2b",
|
|
2875
|
+
"metadata": {
|
|
2876
|
+
"execution": {
|
|
2877
|
+
"iopub.execute_input": "2026-02-02T13:02:46.455210Z",
|
|
2878
|
+
"iopub.status.busy": "2026-02-02T13:02:46.455104Z",
|
|
2879
|
+
"iopub.status.idle": "2026-02-02T13:02:46.458067Z",
|
|
2880
|
+
"shell.execute_reply": "2026-02-02T13:02:46.457726Z"
|
|
2881
|
+
},
|
|
2882
|
+
"papermill": {
|
|
2883
|
+
"duration": 0.070553,
|
|
2884
|
+
"end_time": "2026-02-02T13:02:46.458653",
|
|
2885
|
+
"exception": false,
|
|
2886
|
+
"start_time": "2026-02-02T13:02:46.388100",
|
|
2887
|
+
"status": "completed"
|
|
2888
|
+
},
|
|
2889
|
+
"tags": []
|
|
2890
|
+
},
|
|
2891
|
+
"outputs": [],
|
|
2892
|
+
"source": [
|
|
2893
|
+
"# Feature engineering recommendations based on patterns\n",
|
|
2894
|
+
"print(\"\\n\" + \"=\"*70)\n",
|
|
2895
|
+
"print(\"RECOMMENDED TEMPORAL FEATURES\")\n",
|
|
2896
|
+
"print(\"=\"*70)\n",
|
|
2897
|
+
"\n",
|
|
2898
|
+
"print(\"\\n\\U0001f6e0\\ufe0f Based on detected patterns, consider these features:\\n\")\n",
|
|
2899
|
+
"\n",
|
|
2900
|
+
"print(\"1. RECENCY FEATURES:\")\n",
|
|
2901
|
+
"print(\" - days_since_last_event\")\n",
|
|
2902
|
+
"print(\" - log_days_since_last_event (if right-skewed)\")\n",
|
|
2903
|
+
"print(\" - recency_bucket (categorical: 0-7d, 8-30d, etc.)\")\n",
|
|
2904
|
+
"\n",
|
|
2905
|
+
"if seasonality_results:\n",
|
|
2906
|
+
" weekly = any(6 <= sr.period <= 8 for sr in seasonality_results)\n",
|
|
2907
|
+
" monthly = any(28 <= sr.period <= 32 for sr in seasonality_results)\n",
|
|
2908
|
+
" \n",
|
|
2909
|
+
" print(\"\\n2. SEASONALITY FEATURES:\")\n",
|
|
2910
|
+
" if weekly:\n",
|
|
2911
|
+
" print(\" - is_weekend (binary)\")\n",
|
|
2912
|
+
" print(\" - day_of_week_sin, day_of_week_cos (cyclical encoding)\")\n",
|
|
2913
|
+
" if monthly:\n",
|
|
2914
|
+
" print(\" - day_of_month\")\n",
|
|
2915
|
+
" print(\" - is_month_start, is_month_end\")\n",
|
|
2916
|
+
"\n",
|
|
2917
|
+
"print(\"\\n3. TREND-ADJUSTED FEATURES:\")\n",
|
|
2918
|
+
"if trend_result.direction in [TrendDirection.INCREASING, TrendDirection.DECREASING]:\n",
|
|
2919
|
+
" print(\" - event_count_recent_vs_overall (ratio)\")\n",
|
|
2920
|
+
" print(\" - activity_trend_direction (for each entity)\")\n",
|
|
2921
|
+
"else:\n",
|
|
2922
|
+
" print(\" - Standard time-window aggregations should work well\")\n",
|
|
2923
|
+
"\n",
|
|
2924
|
+
"print(\"\\n4. COHORT FEATURES:\")\n",
|
|
2925
|
+
"print(\" - cohort_month (categorical or ordinal)\")\n",
|
|
2926
|
+
"print(\" - tenure_days (days since first event)\")"
|
|
2927
|
+
]
|
|
2928
|
+
},
|
|
2929
|
+
{
|
|
2930
|
+
"cell_type": "markdown",
|
|
2931
|
+
"id": "532bcc5b",
|
|
2932
|
+
"metadata": {
|
|
2933
|
+
"papermill": {
|
|
2934
|
+
"duration": 0.067259,
|
|
2935
|
+
"end_time": "2026-02-02T13:02:46.591399",
|
|
2936
|
+
"exception": false,
|
|
2937
|
+
"start_time": "2026-02-02T13:02:46.524140",
|
|
2938
|
+
"status": "completed"
|
|
2939
|
+
},
|
|
2940
|
+
"tags": []
|
|
2941
|
+
},
|
|
2942
|
+
"source": [
|
|
2943
|
+
"## 1c.18 Save Pattern Analysis Results"
|
|
2944
|
+
]
|
|
2945
|
+
},
|
|
2946
|
+
{
|
|
2947
|
+
"cell_type": "code",
|
|
2948
|
+
"execution_count": null,
|
|
2949
|
+
"id": "31e14ee3",
|
|
2950
|
+
"metadata": {
|
|
2951
|
+
"execution": {
|
|
2952
|
+
"iopub.execute_input": "2026-02-02T13:02:46.757514Z",
|
|
2953
|
+
"iopub.status.busy": "2026-02-02T13:02:46.757379Z",
|
|
2954
|
+
"iopub.status.idle": "2026-02-02T13:02:46.788737Z",
|
|
2955
|
+
"shell.execute_reply": "2026-02-02T13:02:46.788264Z"
|
|
2956
|
+
},
|
|
2957
|
+
"papermill": {
|
|
2958
|
+
"duration": 0.09925,
|
|
2959
|
+
"end_time": "2026-02-02T13:02:46.789508",
|
|
2960
|
+
"exception": false,
|
|
2961
|
+
"start_time": "2026-02-02T13:02:46.690258",
|
|
2962
|
+
"status": "completed"
|
|
2963
|
+
},
|
|
2964
|
+
"tags": []
|
|
2965
|
+
},
|
|
2966
|
+
"outputs": [],
|
|
2967
|
+
"source": [
|
|
2968
|
+
"# Store pattern analysis results in findings with actionable recommendations\n",
|
|
2969
|
+
"pattern_summary = {\n",
|
|
2970
|
+
" \"windows_used\": {\n",
|
|
2971
|
+
" # Note: aggregation_windows already stored in ts_metadata.suggested_aggregations\n",
|
|
2972
|
+
" \"velocity_window\": pattern_config.velocity_window_days,\n",
|
|
2973
|
+
" \"momentum_pairs\": pattern_config.get_momentum_pairs(),\n",
|
|
2974
|
+
" },\n",
|
|
2975
|
+
" \"trend\": {\n",
|
|
2976
|
+
" \"direction\": trend_result.direction.value,\n",
|
|
2977
|
+
" \"strength\": trend_result.strength,\n",
|
|
2978
|
+
" \"confidence\": trend_result.confidence,\n",
|
|
2979
|
+
" \"recommendations\": TREND_RECOMMENDATIONS if 'TREND_RECOMMENDATIONS' in dir() else [],\n",
|
|
2980
|
+
" },\n",
|
|
2981
|
+
" \"seasonality\": {\n",
|
|
2982
|
+
" \"patterns\": [\n",
|
|
2983
|
+
" {\"period\": sr.period, \"name\": sr.period_name, \"strength\": sr.strength,\n",
|
|
2984
|
+
" \"window_aligned\": sr.period in window_lags if 'window_lags' in dir() else False}\n",
|
|
2985
|
+
" for sr in seasonality_results\n",
|
|
2986
|
+
" ],\n",
|
|
2987
|
+
" \"recommendations\": [],\n",
|
|
2988
|
+
" },\n",
|
|
2989
|
+
"}\n",
|
|
2990
|
+
"\n",
|
|
2991
|
+
"# Generate seasonality recommendations\n",
|
|
2992
|
+
"seasonality_recs = []\n",
|
|
2993
|
+
"if seasonality_results:\n",
|
|
2994
|
+
" strong_patterns = [sr for sr in seasonality_results if sr.strength > 0.5]\n",
|
|
2995
|
+
" moderate_patterns = [sr for sr in seasonality_results if 0.3 < sr.strength <= 0.5]\n",
|
|
2996
|
+
" \n",
|
|
2997
|
+
" for sr in seasonality_results:\n",
|
|
2998
|
+
" if sr.period == 7:\n",
|
|
2999
|
+
" seasonality_recs.append({\n",
|
|
3000
|
+
" \"action\": \"add_cyclical_feature\", \"feature\": \"day_of_week\", \"encoding\": \"sin_cos\",\n",
|
|
3001
|
+
" \"reason\": f\"Weekly pattern detected (strength={sr.strength:.2f})\"\n",
|
|
3002
|
+
" })\n",
|
|
3003
|
+
" elif sr.period in [28, 30, 31]:\n",
|
|
3004
|
+
" seasonality_recs.append({\n",
|
|
3005
|
+
" \"action\": \"add_cyclical_feature\", \"feature\": \"day_of_month\", \"encoding\": \"sin_cos\",\n",
|
|
3006
|
+
" \"reason\": f\"Monthly pattern detected (strength={sr.strength:.2f})\"\n",
|
|
3007
|
+
" })\n",
|
|
3008
|
+
" elif sr.period in [90, 91, 92]:\n",
|
|
3009
|
+
" seasonality_recs.append({\n",
|
|
3010
|
+
" \"action\": \"add_cyclical_feature\", \"feature\": \"quarter\", \"encoding\": \"sin_cos\",\n",
|
|
3011
|
+
" \"reason\": f\"Quarterly pattern detected (strength={sr.strength:.2f})\"\n",
|
|
3012
|
+
" })\n",
|
|
3013
|
+
" \n",
|
|
3014
|
+
" if strong_patterns:\n",
|
|
3015
|
+
" seasonality_recs.append({\n",
|
|
3016
|
+
" \"action\": \"consider_deseasonalization\", \"periods\": [sr.period for sr in strong_patterns],\n",
|
|
3017
|
+
" \"reason\": \"Strong seasonal patterns may dominate signal\"\n",
|
|
3018
|
+
" })\n",
|
|
3019
|
+
" \n",
|
|
3020
|
+
" if 'window_lags' in dir() and window_lags:\n",
|
|
3021
|
+
" aligned = [sr for sr in seasonality_results if sr.period in window_lags]\n",
|
|
3022
|
+
" if aligned:\n",
|
|
3023
|
+
" seasonality_recs.append({\n",
|
|
3024
|
+
" \"action\": \"window_captures_cycle\", \"windows\": [sr.period for sr in aligned],\n",
|
|
3025
|
+
" \"reason\": \"Aggregation window aligns with seasonal cycle\"\n",
|
|
3026
|
+
" })\n",
|
|
3027
|
+
" else:\n",
|
|
3028
|
+
" seasonality_recs.append({\n",
|
|
3029
|
+
" \"action\": \"window_partial_cycle\",\n",
|
|
3030
|
+
" \"detected_periods\": [sr.period for sr in seasonality_results], \"windows\": window_lags,\n",
|
|
3031
|
+
" \"reason\": \"Aggregation windows don't align with detected cycles\"\n",
|
|
3032
|
+
" })\n",
|
|
3033
|
+
"\n",
|
|
3034
|
+
"pattern_summary[\"seasonality\"][\"recommendations\"] = seasonality_recs\n",
|
|
3035
|
+
"\n",
|
|
3036
|
+
"# Add temporal pattern recommendations\n",
|
|
3037
|
+
"if 'TEMPORAL_PATTERN_RECOMMENDATIONS' in dir() and TEMPORAL_PATTERN_RECOMMENDATIONS:\n",
|
|
3038
|
+
" pattern_summary[\"temporal_patterns\"] = {\n",
|
|
3039
|
+
" \"patterns\": [{\"name\": r[\"pattern\"], \"variation\": r.get(\"variation\", 0), \"priority\": r[\"priority\"]} for r in TEMPORAL_PATTERN_RECOMMENDATIONS],\n",
|
|
3040
|
+
" \"recommendations\": [{\"pattern\": r[\"pattern\"], \"features\": r[\"features\"], \"priority\": r[\"priority\"], \"reason\": r[\"reason\"]} for r in TEMPORAL_PATTERN_RECOMMENDATIONS if r.get(\"features\")],\n",
|
|
3041
|
+
" }\n",
|
|
3042
|
+
"\n",
|
|
3043
|
+
"# Add recency analysis with recommendations\n",
|
|
3044
|
+
"if ENTITY_COLUMN:\n",
|
|
3045
|
+
" recency_data = {\n",
|
|
3046
|
+
" \"median_days\": recency_result.median_recency_days,\n",
|
|
3047
|
+
" \"target_correlation\": recency_result.target_correlation,\n",
|
|
3048
|
+
" }\n",
|
|
3049
|
+
" if recency_comparison:\n",
|
|
3050
|
+
" recency_data.update({\n",
|
|
3051
|
+
" \"effect_size\": recency_comparison.cohens_d,\n",
|
|
3052
|
+
" \"effect_interpretation\": recency_comparison.effect_interpretation,\n",
|
|
3053
|
+
" \"distribution_pattern\": recency_comparison.distribution_pattern,\n",
|
|
3054
|
+
" \"inflection_bucket\": recency_comparison.inflection_bucket,\n",
|
|
3055
|
+
" \"retained_median\": recency_comparison.retained_stats.median,\n",
|
|
3056
|
+
" \"churned_median\": recency_comparison.churned_stats.median,\n",
|
|
3057
|
+
" \"key_findings\": [{\"finding\": f.finding, \"metric\": f.metric_name, \"value\": f.metric_value} for f in recency_comparison.key_findings],\n",
|
|
3058
|
+
" \"recommendations\": RECENCY_RECOMMENDATIONS,\n",
|
|
3059
|
+
" })\n",
|
|
3060
|
+
" pattern_summary[\"recency\"] = recency_data\n",
|
|
3061
|
+
"\n",
|
|
3062
|
+
"# Add velocity results\n",
|
|
3063
|
+
"if 'velocity_summary' in dir() and velocity_summary:\n",
|
|
3064
|
+
" pattern_summary[\"velocity\"] = {col: {\"mean_velocity\": v[\"mean_velocity\"], \"direction\": v[\"direction\"]} for col, v in velocity_summary.items()}\n",
|
|
3065
|
+
"\n",
|
|
3066
|
+
"# Add momentum results\n",
|
|
3067
|
+
"if 'momentum_data' in dir() and momentum_data:\n",
|
|
3068
|
+
" pattern_summary[\"momentum\"] = {col: {\"retained\": v[\"retained\"], \"churned\": v[\"churned\"]} for col, v in momentum_data.items()}\n",
|
|
3069
|
+
" if 'divergent_cols' in dir():\n",
|
|
3070
|
+
" pattern_summary[\"momentum\"][\"_divergent_columns\"] = [c for c in divergent_cols if c.lower() != TARGET_COLUMN.lower()] if TARGET_COLUMN else divergent_cols\n",
|
|
3071
|
+
"\n",
|
|
3072
|
+
"# Add cohort analysis results\n",
|
|
3073
|
+
"if 'COHORT_RECOMMENDATIONS' in dir() and COHORT_RECOMMENDATIONS:\n",
|
|
3074
|
+
" pattern_summary[\"cohort\"] = {\"recommendations\": COHORT_RECOMMENDATIONS}\n",
|
|
3075
|
+
"\n",
|
|
3076
|
+
"\n",
|
|
3077
|
+
"# Add categorical analysis results\n",
|
|
3078
|
+
"if 'cat_result' in dir() and cat_result.feature_insights:\n",
|
|
3079
|
+
" pattern_summary[\"categorical\"] = {\n",
|
|
3080
|
+
" \"overall_target_rate\": cat_result.overall_target_rate,\n",
|
|
3081
|
+
" \"features_analyzed\": len(cat_result.feature_insights),\n",
|
|
3082
|
+
" \"columns_filtered\": len(cat_result.filtered_columns),\n",
|
|
3083
|
+
" \"insights\": [\n",
|
|
3084
|
+
" {\"feature\": i.feature_name, \"cramers_v\": i.cramers_v, \"effect_strength\": i.effect_strength,\n",
|
|
3085
|
+
" \"high_risk\": i.high_risk_categories[:3], \"low_risk\": i.low_risk_categories[:3]}\n",
|
|
3086
|
+
" for i in cat_result.feature_insights[:10]\n",
|
|
3087
|
+
" ],\n",
|
|
3088
|
+
" \"recommendations\": cat_result.recommendations,\n",
|
|
3089
|
+
" \"key_findings\": cat_result.key_findings,\n",
|
|
3090
|
+
" }\n",
|
|
3091
|
+
"\n",
|
|
3092
|
+
"# Add velocity analysis results and recommendations\n",
|
|
3093
|
+
"if 'VELOCITY_RECOMMENDATIONS' in dir() and VELOCITY_RECOMMENDATIONS:\n",
|
|
3094
|
+
" pattern_summary[\"velocity\"][\"recommendations\"] = VELOCITY_RECOMMENDATIONS\n",
|
|
3095
|
+
"\n",
|
|
3096
|
+
"# Add momentum recommendations (separate from momentum data which is already stored)\n",
|
|
3097
|
+
"if 'MOMENTUM_RECOMMENDATIONS' in dir() and MOMENTUM_RECOMMENDATIONS:\n",
|
|
3098
|
+
" if \"momentum\" not in pattern_summary:\n",
|
|
3099
|
+
" pattern_summary[\"momentum\"] = {}\n",
|
|
3100
|
+
" pattern_summary[\"momentum\"][\"recommendations\"] = MOMENTUM_RECOMMENDATIONS\n",
|
|
3101
|
+
"\n",
|
|
3102
|
+
"# Add lag correlation recommendations\n",
|
|
3103
|
+
"if 'LAG_RECOMMENDATIONS' in dir() and LAG_RECOMMENDATIONS:\n",
|
|
3104
|
+
" pattern_summary[\"lag\"] = {\"recommendations\": LAG_RECOMMENDATIONS}\n",
|
|
3105
|
+
"\n",
|
|
3106
|
+
"# Add sparkline analysis recommendations (trend, seasonality, scaling)\n",
|
|
3107
|
+
"if 'SPARKLINE_RECOMMENDATIONS' in dir() and SPARKLINE_RECOMMENDATIONS:\n",
|
|
3108
|
+
" pattern_summary[\"sparkline\"] = {\"recommendations\": SPARKLINE_RECOMMENDATIONS}\n",
|
|
3109
|
+
"\n",
|
|
3110
|
+
"# Add effect size recommendations (feature prioritization)\n",
|
|
3111
|
+
"if 'EFFECT_SIZE_RECOMMENDATIONS' in dir() and EFFECT_SIZE_RECOMMENDATIONS:\n",
|
|
3112
|
+
" pattern_summary[\"effect_size\"] = {\"recommendations\": EFFECT_SIZE_RECOMMENDATIONS}\n",
|
|
3113
|
+
"\n",
|
|
3114
|
+
"# Add predictive power recommendations (IV/KS based)\n",
|
|
3115
|
+
"if 'PREDICTIVE_POWER_RECOMMENDATIONS' in dir() and PREDICTIVE_POWER_RECOMMENDATIONS:\n",
|
|
3116
|
+
" pattern_summary[\"predictive_power\"] = {\"recommendations\": PREDICTIVE_POWER_RECOMMENDATIONS}\n",
|
|
3117
|
+
"\n",
|
|
3118
|
+
"# Generate feature flags for 01d aggregation\n",
|
|
3119
|
+
"# These flags tell 01d which optional features to include based on analysis results\n",
|
|
3120
|
+
"pattern_summary[\"feature_flags\"] = {\n",
|
|
3121
|
+
" \"include_recency\": (\n",
|
|
3122
|
+
" recency_comparison.cohens_d > 0.2 \n",
|
|
3123
|
+
" if 'recency_comparison' in dir() and recency_comparison \n",
|
|
3124
|
+
" else True\n",
|
|
3125
|
+
" ),\n",
|
|
3126
|
+
" \"include_tenure\": True, # Default on; could be derived from tenure analysis if available\n",
|
|
3127
|
+
" \"include_lifecycle_quadrant\": ts_meta.temporal_segmentation_recommendation is not None if 'ts_meta' in dir() else False,\n",
|
|
3128
|
+
" \"include_trend_features\": bool(pattern_summary.get(\"trend\", {}).get(\"recommendations\")),\n",
|
|
3129
|
+
" \"include_seasonality_features\": bool(pattern_summary.get(\"seasonality\", {}).get(\"recommendations\")),\n",
|
|
3130
|
+
" \"include_cohort_features\": not any(\n",
|
|
3131
|
+
" r.get(\"action\") == \"skip_cohort_features\"\n",
|
|
3132
|
+
" for r in pattern_summary.get(\"cohort\", {}).get(\"recommendations\", [])\n",
|
|
3133
|
+
" ),\n",
|
|
3134
|
+
"}\n",
|
|
3135
|
+
"\n",
|
|
3136
|
+
"# Save to findings\n",
|
|
3137
|
+
"if not findings.metadata:\n",
|
|
3138
|
+
" findings.metadata = {}\n",
|
|
3139
|
+
"findings.metadata[\"temporal_patterns\"] = pattern_summary\n",
|
|
3140
|
+
"findings.save(FINDINGS_PATH)\n",
|
|
3141
|
+
"\n",
|
|
3142
|
+
"print(f\"Pattern analysis saved to: {FINDINGS_PATH}\")\n",
|
|
3143
|
+
"print(f\"Saved sections: {list(pattern_summary.keys())}\")\n",
|
|
3144
|
+
"\n",
|
|
3145
|
+
"# Print recency recommendations\n",
|
|
3146
|
+
"if pattern_summary.get(\"recency\", {}).get(\"recommendations\"):\n",
|
|
3147
|
+
" recency_recs = pattern_summary[\"recency\"][\"recommendations\"]\n",
|
|
3148
|
+
" print(f\"\\nā±ļø RECENCY FEATURES TO ADD ({len(recency_recs)}):\")\n",
|
|
3149
|
+
" for rec in recency_recs:\n",
|
|
3150
|
+
" priority_icon = {\"high\": \"š“\", \"medium\": \"š”\", \"low\": \"š¢\"}.get(rec.get(\"priority\", \"medium\"), \"āŖ\")\n",
|
|
3151
|
+
" features = rec.get(\"features\", [])\n",
|
|
3152
|
+
" if features:\n",
|
|
3153
|
+
" print(f\" {priority_icon} [{rec['priority'].upper()}] {', '.join(features)}\")\n",
|
|
3154
|
+
" print(f\" {rec['reason']}\")\n",
|
|
3155
|
+
"\n",
|
|
3156
|
+
"# Print cohort recommendations\n",
|
|
3157
|
+
"if \"cohort\" in pattern_summary:\n",
|
|
3158
|
+
" cohort_recs = pattern_summary[\"cohort\"].get(\"recommendations\", [])\n",
|
|
3159
|
+
" feature_recs = [r for r in cohort_recs if r.get(\"features\")]\n",
|
|
3160
|
+
" skip_recs = [r for r in cohort_recs if r.get(\"action\") == \"skip_cohort_features\"]\n",
|
|
3161
|
+
" if skip_recs:\n",
|
|
3162
|
+
" print(f\"\\nš„ COHORT: Skip cohort features ({skip_recs[0]['reason']})\")\n",
|
|
3163
|
+
" elif feature_recs:\n",
|
|
3164
|
+
" print(f\"\\nš„ COHORT FEATURES TO ADD:\")\n",
|
|
3165
|
+
" for rec in feature_recs:\n",
|
|
3166
|
+
" print(f\" ⢠{', '.join(rec['features'])} ({rec['priority']} priority)\")\n",
|
|
3167
|
+
"\n",
|
|
3168
|
+
"# Print trend recommendations\n",
|
|
3169
|
+
"if pattern_summary.get(\"trend\", {}).get(\"recommendations\"):\n",
|
|
3170
|
+
" trend_recs = [r for r in pattern_summary[\"trend\"][\"recommendations\"] if r.get(\"features\")]\n",
|
|
3171
|
+
" if trend_recs:\n",
|
|
3172
|
+
" print(f\"\\nš TREND FEATURES TO ADD ({len(trend_recs)}):\")\n",
|
|
3173
|
+
" for rec in trend_recs:\n",
|
|
3174
|
+
" print(f\" ⢠{', '.join(rec['features'])} ({rec['priority']} priority)\")\n",
|
|
3175
|
+
"\n",
|
|
3176
|
+
"# Print temporal pattern recommendations\n",
|
|
3177
|
+
"if \"temporal_patterns\" in pattern_summary:\n",
|
|
3178
|
+
" tp_recs = pattern_summary[\"temporal_patterns\"].get(\"recommendations\", [])\n",
|
|
3179
|
+
" if tp_recs:\n",
|
|
3180
|
+
"    print(f\"\\n📅 TEMPORAL PATTERN FEATURES TO ADD ({len(tp_recs)}):\")\n",
|
|
3181
|
+
" for rec in tp_recs:\n",
|
|
3182
|
+
" print(f\" ⢠{rec['pattern']}: {', '.join(rec['features'])} ({rec['priority']} priority)\")\n",
|
|
3183
|
+
"\n",
|
|
3184
|
+
"\n",
|
|
3185
|
+
"# Print categorical recommendations\n",
|
|
3186
|
+
"if pattern_summary.get(\"categorical\", {}).get(\"recommendations\"):\n",
|
|
3187
|
+
" cat_recs = pattern_summary[\"categorical\"][\"recommendations\"]\n",
|
|
3188
|
+
" print(f\"\\nš·ļø CATEGORICAL FEATURE RECOMMENDATIONS ({len(cat_recs)}):\")\n",
|
|
3189
|
+
" for rec in cat_recs:\n",
|
|
3190
|
+
" priority_icon = {\"high\": \"š“\", \"medium\": \"š”\", \"low\": \"š¢\"}.get(rec.get(\"priority\", \"medium\"), \"āŖ\")\n",
|
|
3191
|
+
" features = rec.get(\"features\", [])\n",
|
|
3192
|
+
" if features:\n",
|
|
3193
|
+
" print(f\" {priority_icon} [{rec['priority'].upper()}] {rec['action']}\")\n",
|
|
3194
|
+
" print(f\" {rec['reason']}\")\n",
|
|
3195
|
+
"# Print seasonality recommendations\n",
|
|
3196
|
+
"if seasonality_recs:\n",
|
|
3197
|
+
" print(f\"\\nš SEASONALITY RECOMMENDATIONS ({len(seasonality_recs)}):\")\n",
|
|
3198
|
+
" for rec in seasonality_recs:\n",
|
|
3199
|
+
" action = rec[\"action\"].replace(\"_\", \" \").title()\n",
|
|
3200
|
+
" print(f\" ⢠{action}: {rec['reason']}\")\n"
|
|
3201
|
+
]
|
|
3202
|
+
},
|
|
3203
|
+
{
|
|
3204
|
+
"cell_type": "markdown",
|
|
3205
|
+
"id": "773f3e64",
|
|
3206
|
+
"metadata": {
|
|
3207
|
+
"papermill": {
|
|
3208
|
+
"duration": 0.067772,
|
|
3209
|
+
"end_time": "2026-02-02T13:02:46.924911",
|
|
3210
|
+
"exception": false,
|
|
3211
|
+
"start_time": "2026-02-02T13:02:46.857139",
|
|
3212
|
+
"status": "completed"
|
|
3213
|
+
},
|
|
3214
|
+
"tags": []
|
|
3215
|
+
},
|
|
3216
|
+
"source": [
|
|
3217
|
+
"---\n",
|
|
3218
|
+
"\n",
|
|
3219
|
+
"## Summary: What We Learned\n",
|
|
3220
|
+
"\n",
|
|
3221
|
+
"In this notebook, we analyzed temporal patterns:\n",
|
|
3222
|
+
"\n",
|
|
3223
|
+
"1. **Trend Detection** - Identified long-term direction in data\n",
|
|
3224
|
+
"2. **Seasonality** - Found periodic patterns (weekly, monthly)\n",
|
|
3225
|
+
"3. **Cohort Analysis** - Compared behavior by entity join date\n",
|
|
3226
|
+
"4. **Recency Analysis** - Measured how recent activity relates to outcomes\n",
|
|
3227
|
+
"5. **Feature Recommendations** - Generated feature engineering suggestions\n",
|
|
3228
|
+
"\n",
|
|
3229
|
+
"## Pattern Summary\n",
|
|
3230
|
+
"\n",
|
|
3231
|
+
"| Pattern | Status | Recommendation |\n",
|
|
3232
|
+
"|---------|--------|----------------|\n",
|
|
3233
|
+
"| Trend | Check findings | Detrend if strong |\n",
|
|
3234
|
+
"| Seasonality | Check findings | Add cyclical features |\n",
|
|
3235
|
+
"| Cohort Effects | Check findings | Add cohort indicators |\n",
|
|
3236
|
+
"| Recency Effects | Check findings | Prioritize recent windows |\n",
|
|
3237
|
+
"\n",
|
|
3238
|
+
"---\n",
|
|
3239
|
+
"\n",
|
|
3240
|
+
"## Next Steps\n",
|
|
3241
|
+
"\n",
|
|
3242
|
+
"**Complete the Event Bronze Track:**\n",
|
|
3243
|
+
"- **01d_event_aggregation.ipynb** - Aggregate events to entity-level (produces new dataset)\n",
|
|
3244
|
+
"\n",
|
|
3245
|
+
"After 01d produces the aggregated dataset, continue with:\n",
|
|
3246
|
+
"- **02_column_deep_dive.ipynb** - Profile aggregated feature distributions\n",
|
|
3247
|
+
"- **03_quality_assessment.ipynb** - Quality checks on aggregated data\n",
|
|
3248
|
+
"- **04_relationship_analysis.ipynb** - Feature correlations and relationships\n",
|
|
3249
|
+
"\n",
|
|
3250
|
+
"The aggregated data from 01d becomes the input for the Entity Bronze Track."
|
|
3251
|
+
]
|
|
3252
|
+
},
|
|
3253
|
+
{
|
|
3254
|
+
"cell_type": "markdown",
|
|
3255
|
+
"id": "aaab3af5",
|
|
3256
|
+
"metadata": {
|
|
3257
|
+
"papermill": {
|
|
3258
|
+
"duration": 0.068901,
|
|
3259
|
+
"end_time": "2026-02-02T13:02:47.115122",
|
|
3260
|
+
"exception": false,
|
|
3261
|
+
"start_time": "2026-02-02T13:02:47.046221",
|
|
3262
|
+
"status": "completed"
|
|
3263
|
+
},
|
|
3264
|
+
"tags": []
|
|
3265
|
+
},
|
|
3266
|
+
"source": [
|
|
3267
|
+
"> **Save Reminder:** Save this notebook (Ctrl+S / Cmd+S) before running the next one.\n",
|
|
3268
|
+
"> The next notebook will automatically export this notebook's HTML documentation from the saved file."
|
|
3269
|
+
]
|
|
3270
|
+
}
|
|
3271
|
+
],
|
|
3272
|
+
"metadata": {
|
|
3273
|
+
"kernelspec": {
|
|
3274
|
+
"display_name": "Python 3",
|
|
3275
|
+
"language": "python",
|
|
3276
|
+
"name": "python3"
|
|
3277
|
+
},
|
|
3278
|
+
"language_info": {
|
|
3279
|
+
"codemirror_mode": {
|
|
3280
|
+
"name": "ipython",
|
|
3281
|
+
"version": 3
|
|
3282
|
+
},
|
|
3283
|
+
"file_extension": ".py",
|
|
3284
|
+
"mimetype": "text/x-python",
|
|
3285
|
+
"name": "python",
|
|
3286
|
+
"nbconvert_exporter": "python",
|
|
3287
|
+
"pygments_lexer": "ipython3",
|
|
3288
|
+
"version": "3.12.4"
|
|
3289
|
+
},
|
|
3290
|
+
"papermill": {
|
|
3291
|
+
"default_parameters": {},
|
|
3292
|
+
"duration": 99.332892,
|
|
3293
|
+
"end_time": "2026-02-02T13:02:47.701993",
|
|
3294
|
+
"environment_variables": {},
|
|
3295
|
+
"exception": null,
|
|
3296
|
+
"input_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/01c_temporal_patterns.ipynb",
|
|
3297
|
+
"output_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/01c_temporal_patterns.ipynb",
|
|
3298
|
+
"parameters": {},
|
|
3299
|
+
"start_time": "2026-02-02T13:01:08.369101",
|
|
3300
|
+
"version": "2.6.0"
|
|
3301
|
+
}
|
|
3302
|
+
},
|
|
3303
|
+
"nbformat": 4,
|
|
3304
|
+
"nbformat_minor": 5
|
|
3305
|
+
}
|