churnkit-0.75.0a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
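
The `churnkit-0.75.0a1.data/data/...` entries above are wheel data files: pip installs them relative to the environment's installation prefix (the sysconfig `data` path), so the exploration notebooks end up under `<prefix>/share/churnkit/exploration_notebooks`. A minimal sketch for locating them after installation — the directory layout comes from the manifest above, everything else is illustrative and assumes standard wheel data-file handling:

```python
# List the exploration notebooks shipped in the wheel's data directory.
# Assumes standard behaviour for "*.data/data/..." wheel files: they are
# installed under the sysconfig "data" prefix (e.g. the virtualenv root).
import sysconfig
from pathlib import Path

data_prefix = Path(sysconfig.get_paths()["data"])
notebooks_dir = data_prefix / "share" / "churnkit" / "exploration_notebooks"

if notebooks_dir.is_dir():
    for nb in sorted(notebooks_dir.glob("*.ipynb")):
        print(nb.name)
else:
    print(f"Not found: {notebooks_dir} (is churnkit installed in this environment?)")
```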
|
@@ -0,0 +1,1165 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"id": "cell-0",
|
|
6
|
+
"metadata": {
|
|
7
|
+
"papermill": {
|
|
8
|
+
"duration": 0.00294,
|
|
9
|
+
"end_time": "2026-02-02T13:00:36.292705",
|
|
10
|
+
"exception": false,
|
|
11
|
+
"start_time": "2026-02-02T13:00:36.289765",
|
|
12
|
+
"status": "completed"
|
|
13
|
+
},
|
|
14
|
+
"tags": []
|
|
15
|
+
},
|
|
16
|
+
"source": [
|
|
17
|
+
"# Start Here: Data Discovery\n",
|
|
18
|
+
"\n",
|
|
19
|
+
"**Purpose:** Create a point-in-time snapshot and understand your dataset's structure through automatic profiling.\n",
|
|
20
|
+
"\n",
|
|
21
|
+
"**What you'll learn:**\n",
|
|
22
|
+
"- How to create temporally-safe training snapshots\n",
|
|
23
|
+
"- How automatic type inference works and when to override it\n",
|
|
24
|
+
"- How to identify entity-level vs event-level data\n",
|
|
25
|
+
"- How to set up your target column for downstream analysis\n",
|
|
26
|
+
"\n",
|
|
27
|
+
"**Outputs:**\n",
|
|
28
|
+
"- Point-in-time training snapshot (Parquet)\n",
|
|
29
|
+
"- Dataset overview (rows, columns, memory, format, structure)\n",
|
|
30
|
+
"- Automatic column type inference with confidence scores\n",
|
|
31
|
+
"- Saved exploration findings (YAML)\n",
|
|
32
|
+
"\n",
|
|
33
|
+
"---\n",
|
|
34
|
+
"\n",
|
|
35
|
+
"## How to Read This Notebook\n",
|
|
36
|
+
"\n",
|
|
37
|
+
"Each section includes:\n",
|
|
38
|
+
"- **📊 Charts** - Interactive Plotly visualizations\n",
|
|
39
|
+
"- **📖 Interpretation Guide** - How to read and understand the output\n",
|
|
40
|
+
"- **✅ Actions** - What to do based on the findings"
|
|
41
|
+
]
|
|
42
|
+
},
|
|
43
|
+
{
|
|
44
|
+
"cell_type": "markdown",
|
|
45
|
+
"id": "cell-1",
|
|
46
|
+
"metadata": {
|
|
47
|
+
"papermill": {
|
|
48
|
+
"duration": 0.002082,
|
|
49
|
+
"end_time": "2026-02-02T13:00:36.297433",
|
|
50
|
+
"exception": false,
|
|
51
|
+
"start_time": "2026-02-02T13:00:36.295351",
|
|
52
|
+
"status": "completed"
|
|
53
|
+
},
|
|
54
|
+
"tags": []
|
|
55
|
+
},
|
|
56
|
+
"source": [
|
|
57
|
+
"## 1.1 Configuration\n",
|
|
58
|
+
"\n",
|
|
59
|
+
"Configure your data source and target column **before** running the notebook."
|
|
60
|
+
]
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
"cell_type": "code",
|
|
64
|
+
"execution_count": null,
|
|
65
|
+
"id": "cell-2",
|
|
66
|
+
"metadata": {
|
|
67
|
+
"execution": {
|
|
68
|
+
"iopub.execute_input": "2026-02-02T13:00:36.302181Z",
|
|
69
|
+
"iopub.status.busy": "2026-02-02T13:00:36.302053Z",
|
|
70
|
+
"iopub.status.idle": "2026-02-02T13:00:38.169709Z",
|
|
71
|
+
"shell.execute_reply": "2026-02-02T13:00:38.168998Z"
|
|
72
|
+
},
|
|
73
|
+
"papermill": {
|
|
74
|
+
"duration": 1.871147,
|
|
75
|
+
"end_time": "2026-02-02T13:00:38.170509",
|
|
76
|
+
"exception": false,
|
|
77
|
+
"start_time": "2026-02-02T13:00:36.299362",
|
|
78
|
+
"status": "completed"
|
|
79
|
+
},
|
|
80
|
+
"tags": []
|
|
81
|
+
},
|
|
82
|
+
"outputs": [],
|
|
83
|
+
"source": [
|
|
84
|
+
"from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
|
|
85
|
+
"track_and_export_previous(\"01_data_discovery.ipynb\")\n",
|
|
86
|
+
"\n",
|
|
87
|
+
"from datetime import datetime\n",
|
|
88
|
+
"from pathlib import Path\n",
|
|
89
|
+
"\n",
|
|
90
|
+
"import pandas as pd\n",
|
|
91
|
+
"\n",
|
|
92
|
+
"from customer_retention.analysis.auto_explorer import DataExplorer\n",
|
|
93
|
+
"from customer_retention.analysis.auto_explorer.findings import TimeSeriesMetadata\n",
|
|
94
|
+
"from customer_retention.analysis.visualization import ChartBuilder, console, display_figure, display_table\n",
|
|
95
|
+
"from customer_retention.core.config.column_config import DatasetGranularity\n",
|
|
96
|
+
"from customer_retention.core.config.experiments import (\n",
|
|
97
|
+
" EXPERIMENTS_DIR,\n",
|
|
98
|
+
" FINDINGS_DIR, # noqa: F401 - required for test validation\n",
|
|
99
|
+
" OUTPUT_DIR,\n",
|
|
100
|
+
" setup_experiments_structure,\n",
|
|
101
|
+
")\n",
|
|
102
|
+
"from customer_retention.stages.profiling import TypeDetector\n",
|
|
103
|
+
"from customer_retention.stages.temporal import (\n",
|
|
104
|
+
" TEMPORAL_METADATA_COLS,\n",
|
|
105
|
+
" CutoffAnalyzer,\n",
|
|
106
|
+
" PointInTimeRegistry,\n",
|
|
107
|
+
" ScenarioDetector,\n",
|
|
108
|
+
" UnifiedDataPreparer,\n",
|
|
109
|
+
")\n",
|
|
110
|
+
"from customer_retention.stages.validation import TimeSeriesDetector"
|
|
111
|
+
]
|
|
112
|
+
},
|
|
113
|
+
{
|
|
114
|
+
"cell_type": "code",
|
|
115
|
+
"execution_count": null,
|
|
116
|
+
"id": "cell-3",
|
|
117
|
+
"metadata": {
|
|
118
|
+
"execution": {
|
|
119
|
+
"iopub.execute_input": "2026-02-02T13:00:38.175150Z",
|
|
120
|
+
"iopub.status.busy": "2026-02-02T13:00:38.175032Z",
|
|
121
|
+
"iopub.status.idle": "2026-02-02T13:00:38.178565Z",
|
|
122
|
+
"shell.execute_reply": "2026-02-02T13:00:38.178104Z"
|
|
123
|
+
},
|
|
124
|
+
"papermill": {
|
|
125
|
+
"duration": 0.006562,
|
|
126
|
+
"end_time": "2026-02-02T13:00:38.179180",
|
|
127
|
+
"exception": false,
|
|
128
|
+
"start_time": "2026-02-02T13:00:38.172618",
|
|
129
|
+
"status": "completed"
|
|
130
|
+
},
|
|
131
|
+
"tags": []
|
|
132
|
+
},
|
|
133
|
+
"outputs": [],
|
|
134
|
+
"source": [
|
|
135
|
+
"# =============================================================================\n",
|
|
136
|
+
"# CONFIGURATION - Set these before running\n",
|
|
137
|
+
"# =============================================================================\n",
|
|
138
|
+
"\n",
|
|
139
|
+
"# DATA_PATH: Path to your data file (CSV, Parquet, or Delta)\n",
|
|
140
|
+
"DATA_PATH = \"../tests/fixtures/customer_retention_retail.csv\"\n",
|
|
141
|
+
"\n",
|
|
142
|
+
"# TARGET_COLUMN: Your prediction target (set to None for auto-detection)\n",
|
|
143
|
+
"TARGET_COLUMN = \"unsubscribed\"\n",
|
|
144
|
+
"\n",
|
|
145
|
+
"# ENTITY_COLUMN: Customer/user ID column (set to None for auto-detection)\n",
|
|
146
|
+
"ENTITY_COLUMN = None\n",
|
|
147
|
+
"\n",
|
|
148
|
+
"# LABEL_WINDOW_DAYS: Days after last activity to derive label timestamp\n",
|
|
149
|
+
"# Used when no explicit label timestamp column exists (e.g., churn_date)\n",
|
|
150
|
+
"# Default: 180 days (6 months observation window)\n",
|
|
151
|
+
"LABEL_WINDOW_DAYS = 180\n",
|
|
152
|
+
"\n",
|
|
153
|
+
"# TIMESTAMP_CONFIG: Override auto-detection if needed (set to None for auto-detection)\n",
|
|
154
|
+
"# Example manual override:\n",
|
|
155
|
+
"# TIMESTAMP_CONFIG = TimestampConfig(\n",
|
|
156
|
+
"# strategy=TimestampStrategy.PRODUCTION,\n",
|
|
157
|
+
"# feature_timestamp_column=\"observation_date\",\n",
|
|
158
|
+
"# label_timestamp_column=\"churn_date\",\n",
|
|
159
|
+
"# )\n",
|
|
160
|
+
"TIMESTAMP_CONFIG = None\n",
|
|
161
|
+
"\n",
|
|
162
|
+
"# =============================================================================\n",
|
|
163
|
+
"# SAMPLE DATASETS (for learning/testing only)\n",
|
|
164
|
+
"# =============================================================================\n",
|
|
165
|
+
"# ENTITY-LEVEL (one row per customer):\n",
|
|
166
|
+
"# DATA_PATH = \"../tests/fixtures/customer_retention_retail.csv\"\n",
|
|
167
|
+
"# DATA_PATH = \"../tests/fixtures/bank_customer_churn.csv\"\n",
|
|
168
|
+
"# DATA_PATH = \"../tests/fixtures/netflix_customer_churn.csv\"\n",
|
|
169
|
+
"#\n",
|
|
170
|
+
"# EVENT-LEVEL (multiple rows per customer):\n",
|
|
171
|
+
"# DATA_PATH = \"../tests/fixtures/customer_transactions.csv\"\n",
|
|
172
|
+
"DATA_PATH = \"../tests/fixtures/customer_emails.csv\"\n",
|
|
173
|
+
"# =============================================================================\n",
|
|
174
|
+
"\n",
|
|
175
|
+
"# OUTPUT_DIR: All outputs go here (gitignored)\n",
|
|
176
|
+
"# OUTPUT_DIR imported from customer_retention.core.config.experiments\n",
|
|
177
|
+
"setup_experiments_structure() # Creates all experiment directories\n"
|
|
178
|
+
]
|
|
179
|
+
},
|
|
180
|
+
{
|
|
181
|
+
"cell_type": "markdown",
|
|
182
|
+
"id": "cell-4",
|
|
183
|
+
"metadata": {
|
|
184
|
+
"papermill": {
|
|
185
|
+
"duration": 0.001804,
|
|
186
|
+
"end_time": "2026-02-02T13:00:38.182826",
|
|
187
|
+
"exception": false,
|
|
188
|
+
"start_time": "2026-02-02T13:00:38.181022",
|
|
189
|
+
"status": "completed"
|
|
190
|
+
},
|
|
191
|
+
"tags": []
|
|
192
|
+
},
|
|
193
|
+
"source": [
|
|
194
|
+
"## 1.2 Load Data & Create Point-in-Time Snapshot\n",
|
|
195
|
+
"\n",
|
|
196
|
+
"**This is the critical first step.** We:\n",
|
|
197
|
+
"1. Load raw data\n",
|
|
198
|
+
"2. Detect temporal scenario (production timestamps, derived, or synthetic)\n",
|
|
199
|
+
"3. Create a versioned snapshot with `feature_timestamp` and `label_timestamp`\n",
|
|
200
|
+
"4. All subsequent analysis uses the snapshot data\n",
|
|
201
|
+
"\n",
|
|
202
|
+
"This ensures temporal integrity and prevents data leakage."
|
|
203
|
+
]
|
|
204
|
+
},
|
|
205
|
+
{
|
|
206
|
+
"cell_type": "code",
|
|
207
|
+
"execution_count": null,
|
|
208
|
+
"id": "cell-5",
|
|
209
|
+
"metadata": {
|
|
210
|
+
"execution": {
|
|
211
|
+
"iopub.execute_input": "2026-02-02T13:00:38.187106Z",
|
|
212
|
+
"iopub.status.busy": "2026-02-02T13:00:38.186994Z",
|
|
213
|
+
"iopub.status.idle": "2026-02-02T13:00:38.358866Z",
|
|
214
|
+
"shell.execute_reply": "2026-02-02T13:00:38.358388Z"
|
|
215
|
+
},
|
|
216
|
+
"papermill": {
|
|
217
|
+
"duration": 0.175034,
|
|
218
|
+
"end_time": "2026-02-02T13:00:38.359612",
|
|
219
|
+
"exception": false,
|
|
220
|
+
"start_time": "2026-02-02T13:00:38.184578",
|
|
221
|
+
"status": "completed"
|
|
222
|
+
},
|
|
223
|
+
"tags": []
|
|
224
|
+
},
|
|
225
|
+
"outputs": [],
|
|
226
|
+
"source": [
|
|
227
|
+
"# Load raw data\n",
|
|
228
|
+
"raw_df = pd.read_csv(DATA_PATH) if DATA_PATH.endswith('.csv') else pd.read_parquet(DATA_PATH)\n",
|
|
229
|
+
"\n",
|
|
230
|
+
"console.start_section()\n",
|
|
231
|
+
"console.header(\"Raw Data Loaded\")\n",
|
|
232
|
+
"console.metric(\"Source\", DATA_PATH)\n",
|
|
233
|
+
"console.metric(\"Rows\", f\"{len(raw_df):,}\")\n",
|
|
234
|
+
"console.metric(\"Columns\", len(raw_df.columns))\n",
|
|
235
|
+
"console.end_section()\n",
|
|
236
|
+
"\n",
|
|
237
|
+
"# Detect granularity and entity column\n",
|
|
238
|
+
"type_detector = TypeDetector()\n",
|
|
239
|
+
"granularity_result = type_detector.detect_granularity(raw_df)\n",
|
|
240
|
+
"entity_column = ENTITY_COLUMN or granularity_result.entity_column\n",
|
|
241
|
+
"\n",
|
|
242
|
+
"# Detect or use provided timestamp configuration\n",
|
|
243
|
+
"if TIMESTAMP_CONFIG:\n",
|
|
244
|
+
" ts_config = TIMESTAMP_CONFIG\n",
|
|
245
|
+
" scenario = \"MANUAL_OVERRIDE\"\n",
|
|
246
|
+
" discovery_result = None\n",
|
|
247
|
+
" console.info(f\"Using manual timestamp config: {ts_config.strategy.value}\")\n",
|
|
248
|
+
"else:\n",
|
|
249
|
+
" detector = ScenarioDetector(label_window_days=LABEL_WINDOW_DAYS)\n",
|
|
250
|
+
" scenario, ts_config, discovery_result = detector.detect(raw_df, TARGET_COLUMN)\n",
|
|
251
|
+
"\n",
|
|
252
|
+
"console.start_section()\n",
|
|
253
|
+
"console.header(\"Temporal Scenario Detection\")\n",
|
|
254
|
+
"console.metric(\"Scenario\", scenario)\n",
|
|
255
|
+
"console.metric(\"Strategy\", ts_config.strategy.value)\n",
|
|
256
|
+
"console.metric(\"Label Window\", f\"{LABEL_WINDOW_DAYS} days\")\n",
|
|
257
|
+
"\n",
|
|
258
|
+
"if discovery_result:\n",
|
|
259
|
+
" if discovery_result.feature_timestamp:\n",
|
|
260
|
+
" source_col = discovery_result.feature_timestamp.column_name\n",
|
|
261
|
+
" if discovery_result.feature_timestamp.is_derived:\n",
|
|
262
|
+
" console.metric(\"Feature Timestamp\", f\"derived from {discovery_result.feature_timestamp.source_columns}\")\n",
|
|
263
|
+
" else:\n",
|
|
264
|
+
" was_promoted = \"promoted\" in discovery_result.feature_timestamp.notes.lower()\n",
|
|
265
|
+
" if was_promoted:\n",
|
|
266
|
+
" console.metric(\"Feature Timestamp\", f\"{source_col} (auto-selected as latest activity)\")\n",
|
|
267
|
+
" else:\n",
|
|
268
|
+
" console.metric(\"Feature Timestamp\", f\"{source_col} (explicit match)\")\n",
|
|
269
|
+
"\n",
|
|
270
|
+
" if discovery_result.label_timestamp:\n",
|
|
271
|
+
" if discovery_result.label_timestamp.is_derived:\n",
|
|
272
|
+
" console.metric(\"Label Timestamp\", f\"derived: {discovery_result.label_timestamp.derivation_formula}\")\n",
|
|
273
|
+
" else:\n",
|
|
274
|
+
" console.metric(\"Label Timestamp\", f\"{discovery_result.label_timestamp.column_name} (explicit match)\")\n",
|
|
275
|
+
"\n",
|
|
276
|
+
" if \"datetime_ordering\" in discovery_result.discovery_report:\n",
|
|
277
|
+
" ordering = discovery_result.discovery_report[\"datetime_ordering\"]\n",
|
|
278
|
+
" if ordering:\n",
|
|
279
|
+
" console.info(f\"Datetime column ordering: {' → '.join(ordering)}\")\n",
|
|
280
|
+
"\n",
|
|
281
|
+
"console.end_section()"
|
|
282
|
+
]
|
|
283
|
+
},
|
|
284
|
+
{
|
|
285
|
+
"cell_type": "markdown",
|
|
286
|
+
"id": "jlzs6bfy2z",
|
|
287
|
+
"metadata": {
|
|
288
|
+
"papermill": {
|
|
289
|
+
"duration": 0.002056,
|
|
290
|
+
"end_time": "2026-02-02T13:00:38.363922",
|
|
291
|
+
"exception": false,
|
|
292
|
+
"start_time": "2026-02-02T13:00:38.361866",
|
|
293
|
+
"status": "completed"
|
|
294
|
+
},
|
|
295
|
+
"tags": []
|
|
296
|
+
},
|
|
297
|
+
"source": [
|
|
298
|
+
"### Cutoff Date Selection\n",
|
|
299
|
+
"\n",
|
|
300
|
+
"The chart below shows the temporal distribution of your data. Use it to select an appropriate cutoff date:\n",
|
|
301
|
+
"\n",
|
|
302
|
+
"- **Top chart**: Records per time bin and cumulative count\n",
|
|
303
|
+
"- **Bottom chart**: Train/Score split percentage at each potential cutoff date\n",
|
|
304
|
+
"- **Suggested cutoff** (blue dashed): Achieves ~90% train / 10% score split\n",
|
|
305
|
+
"\n",
|
|
306
|
+
"**Final data allocation:**\n",
|
|
307
|
+
"- Cutoff: 90% train, 10% score (holdout for final evaluation)\n",
|
|
308
|
+
"- Train/Test split: 89% train, 11% test (from the 90%)\n",
|
|
309
|
+
"- **Result: ~80% training, ~10% test, ~10% score**\n",
|
|
310
|
+
"\n",
|
|
311
|
+
"Adjust `CUTOFF_DATE` below if the suggested date doesn't fit your needs."
|
|
312
|
+
]
|
|
313
|
+
},
|
|
314
|
+
{
|
|
315
|
+
"cell_type": "code",
|
|
316
|
+
"execution_count": null,
|
|
317
|
+
"id": "lfuwm8yytw",
|
|
318
|
+
"metadata": {
|
|
319
|
+
"execution": {
|
|
320
|
+
"iopub.execute_input": "2026-02-02T13:00:38.368471Z",
|
|
321
|
+
"iopub.status.busy": "2026-02-02T13:00:38.368352Z",
|
|
322
|
+
"iopub.status.idle": "2026-02-02T13:00:38.411174Z",
|
|
323
|
+
"shell.execute_reply": "2026-02-02T13:00:38.410774Z"
|
|
324
|
+
},
|
|
325
|
+
"papermill": {
|
|
326
|
+
"duration": 0.046024,
|
|
327
|
+
"end_time": "2026-02-02T13:00:38.411683",
|
|
328
|
+
"exception": false,
|
|
329
|
+
"start_time": "2026-02-02T13:00:38.365659",
|
|
330
|
+
"status": "completed"
|
|
331
|
+
},
|
|
332
|
+
"tags": []
|
|
333
|
+
},
|
|
334
|
+
"outputs": [],
|
|
335
|
+
"source": [
|
|
336
|
+
"# Analyze temporal distribution for cutoff selection\n",
|
|
337
|
+
"from customer_retention.stages.temporal import DatetimeOrderAnalyzer\n",
|
|
338
|
+
"\n",
|
|
339
|
+
"cutoff_analyzer = CutoffAnalyzer()\n",
|
|
340
|
+
"cutoff_analysis = None\n",
|
|
341
|
+
"\n",
|
|
342
|
+
"# Derive last_action_date by coalescing all datetime columns (latest-median first)\n",
|
|
343
|
+
"datetime_order_analyzer = DatetimeOrderAnalyzer()\n",
|
|
344
|
+
"last_action_series = datetime_order_analyzer.derive_last_action_date(raw_df)\n",
|
|
345
|
+
"\n",
|
|
346
|
+
"# Fallback to feature_timestamp column if no datetime columns found\n",
|
|
347
|
+
"timestamp_col = None\n",
|
|
348
|
+
"if last_action_series is None:\n",
|
|
349
|
+
" if discovery_result and discovery_result.feature_timestamp:\n",
|
|
350
|
+
" if not discovery_result.feature_timestamp.is_derived:\n",
|
|
351
|
+
" timestamp_col = discovery_result.feature_timestamp.column_name\n",
|
|
352
|
+
"\n",
|
|
353
|
+
"# Check registry for existing cutoff\n",
|
|
354
|
+
"pit_registry = PointInTimeRegistry(OUTPUT_DIR)\n",
|
|
355
|
+
"registry_cutoff = pit_registry.check_consistency().reference_cutoff\n",
|
|
356
|
+
"\n",
|
|
357
|
+
"if last_action_series is not None:\n",
|
|
358
|
+
" cutoff_analysis = cutoff_analyzer.analyze(raw_df, timestamp_series=last_action_series, n_bins=50)\n",
|
|
359
|
+
" data_suggested_cutoff = cutoff_analysis.suggest_cutoff(train_ratio=0.9)\n",
|
|
360
|
+
"\n",
|
|
361
|
+
" console.start_section()\n",
|
|
362
|
+
" console.header(\"Cutoff Date Analysis\")\n",
|
|
363
|
+
" console.metric(\"Timestamp Source\", \"last_action_date (coalesced)\")\n",
|
|
364
|
+
" console.metric(\"Coverage\", f\"{cutoff_analysis.covered_rows:,} / {cutoff_analysis.source_rows:,} rows ({cutoff_analysis.coverage_ratio:.1%})\")\n",
|
|
365
|
+
" if cutoff_analysis.coverage_ratio < 0.95:\n",
|
|
366
|
+
" console.warning(\"Low timestamp coverage — consider filling missing dates\")\n",
|
|
367
|
+
" console.metric(\"Date Range\", f\"{cutoff_analysis.date_range[0].strftime('%Y-%m-%d')} to {cutoff_analysis.date_range[1].strftime('%Y-%m-%d')}\")\n",
|
|
368
|
+
" console.metric(\"Data-Suggested Cutoff\", data_suggested_cutoff.strftime(\"%Y-%m-%d\"))\n",
|
|
369
|
+
" split = cutoff_analysis.get_split_at_date(data_suggested_cutoff)\n",
|
|
370
|
+
" console.metric(\"At Suggested Split\", f\"{split['train_pct']:.0f}% train / {split['score_pct']:.0f}% score\")\n",
|
|
371
|
+
"\n",
|
|
372
|
+
" if registry_cutoff:\n",
|
|
373
|
+
" console.warning(f\"Registry has cutoff: {registry_cutoff.date()} (may be stale)\")\n",
|
|
374
|
+
" console.info(\"To clear: pit_registry.clear_registry()\")\n",
|
|
375
|
+
"\n",
|
|
376
|
+
" # Show milestones for reference\n",
|
|
377
|
+
" milestones = cutoff_analysis.get_percentage_milestones(step=10)\n",
|
|
378
|
+
" if milestones:\n",
|
|
379
|
+
" console.subheader(\"Reference Dates (10% intervals)\")\n",
|
|
380
|
+
" for m in milestones:\n",
|
|
381
|
+
" console.info(f\" {m['train_pct']:.0f}% train: {m['date'].strftime('%Y-%m-%d')}\")\n",
|
|
382
|
+
" console.end_section()\n",
|
|
383
|
+
"elif timestamp_col:\n",
|
|
384
|
+
" cutoff_analysis = cutoff_analyzer.analyze(raw_df, timestamp_column=timestamp_col, n_bins=50)\n",
|
|
385
|
+
" data_suggested_cutoff = cutoff_analysis.suggest_cutoff(train_ratio=0.9)\n",
|
|
386
|
+
"\n",
|
|
387
|
+
" console.start_section()\n",
|
|
388
|
+
" console.header(\"Cutoff Date Analysis\")\n",
|
|
389
|
+
" console.metric(\"Timestamp Column\", timestamp_col)\n",
|
|
390
|
+
" console.metric(\"Coverage\", f\"{cutoff_analysis.covered_rows:,} / {cutoff_analysis.source_rows:,} rows ({cutoff_analysis.coverage_ratio:.1%})\")\n",
|
|
391
|
+
" if cutoff_analysis.coverage_ratio < 0.95:\n",
|
|
392
|
+
" console.warning(\"Low timestamp coverage — consider filling missing dates\")\n",
|
|
393
|
+
" console.metric(\"Date Range\", f\"{cutoff_analysis.date_range[0].strftime('%Y-%m-%d')} to {cutoff_analysis.date_range[1].strftime('%Y-%m-%d')}\")\n",
|
|
394
|
+
" console.metric(\"Data-Suggested Cutoff\", data_suggested_cutoff.strftime(\"%Y-%m-%d\"))\n",
|
|
395
|
+
" split = cutoff_analysis.get_split_at_date(data_suggested_cutoff)\n",
|
|
396
|
+
" console.metric(\"At Suggested Split\", f\"{split['train_pct']:.0f}% train / {split['score_pct']:.0f}% score\")\n",
|
|
397
|
+
"\n",
|
|
398
|
+
" if registry_cutoff:\n",
|
|
399
|
+
" console.warning(f\"Registry has cutoff: {registry_cutoff.date()} (may be stale)\")\n",
|
|
400
|
+
" console.info(\"To clear: pit_registry.clear_registry()\")\n",
|
|
401
|
+
"\n",
|
|
402
|
+
" milestones = cutoff_analysis.get_percentage_milestones(step=10)\n",
|
|
403
|
+
" if milestones:\n",
|
|
404
|
+
" console.subheader(\"Reference Dates (10% intervals)\")\n",
|
|
405
|
+
" for m in milestones:\n",
|
|
406
|
+
" console.info(f\" {m['train_pct']:.0f}% train: {m['date'].strftime('%Y-%m-%d')}\")\n",
|
|
407
|
+
" console.end_section()\n",
|
|
408
|
+
"else:\n",
|
|
409
|
+
" data_suggested_cutoff = datetime.now()\n",
|
|
410
|
+
" console.start_section()\n",
|
|
411
|
+
" console.header(\"Cutoff Date Analysis\")\n",
|
|
412
|
+
" console.warning(\"No timestamp column detected\")\n",
|
|
413
|
+
" console.metric(\"Default Cutoff\", data_suggested_cutoff.strftime(\"%Y-%m-%d\"))\n",
|
|
414
|
+
" if registry_cutoff:\n",
|
|
415
|
+
" console.info(f\"Registry cutoff: {registry_cutoff.date()}\")\n",
|
|
416
|
+
" console.end_section()"
|
|
417
|
+
]
|
|
418
|
+
},
|
|
419
|
+
{
|
|
420
|
+
"cell_type": "code",
|
|
421
|
+
"execution_count": null,
|
|
422
|
+
"id": "ls2ezi1t5ag",
|
|
423
|
+
"metadata": {
|
|
424
|
+
"execution": {
|
|
425
|
+
"iopub.execute_input": "2026-02-02T13:00:38.416642Z",
|
|
426
|
+
"iopub.status.busy": "2026-02-02T13:00:38.416549Z",
|
|
427
|
+
"iopub.status.idle": "2026-02-02T13:00:38.546887Z",
|
|
428
|
+
"shell.execute_reply": "2026-02-02T13:00:38.546376Z"
|
|
429
|
+
},
|
|
430
|
+
"papermill": {
|
|
431
|
+
"duration": 0.133635,
|
|
432
|
+
"end_time": "2026-02-02T13:00:38.547468",
|
|
433
|
+
"exception": false,
|
|
434
|
+
"start_time": "2026-02-02T13:00:38.413833",
|
|
435
|
+
"status": "completed"
|
|
436
|
+
},
|
|
437
|
+
"tags": []
|
|
438
|
+
},
|
|
439
|
+
"outputs": [],
|
|
440
|
+
"source": [
|
|
441
|
+
"# =============================================================================\n",
|
|
442
|
+
"# CUTOFF DATE SELECTION - Set your preferred cutoff date\n",
|
|
443
|
+
"# =============================================================================\n",
|
|
444
|
+
"# Options:\n",
|
|
445
|
+
"# None = use data-suggested cutoff (~90/10 split)\n",
|
|
446
|
+
"# datetime(YYYY, M, D) = use specific date\n",
|
|
447
|
+
"#\n",
|
|
448
|
+
"# To clear stale registry: pit_registry.clear_registry()\n",
|
|
449
|
+
"# =============================================================================\n",
|
|
450
|
+
"CUTOFF_DATE = None # e.g., datetime(2017, 7, 1)\n",
|
|
451
|
+
"\n",
|
|
452
|
+
"# Compute final selected cutoff\n",
|
|
453
|
+
"selected_cutoff = CUTOFF_DATE or data_suggested_cutoff\n",
|
|
454
|
+
"\n",
|
|
455
|
+
"console.start_section()\n",
|
|
456
|
+
"console.header(\"Selected Cutoff Date\")\n",
|
|
457
|
+
"if CUTOFF_DATE:\n",
|
|
458
|
+
" console.info(f\"Manual override: {CUTOFF_DATE.strftime('%Y-%m-%d')}\")\n",
|
|
459
|
+
"else:\n",
|
|
460
|
+
" console.info(f\"Using data-suggested: {selected_cutoff.strftime('%Y-%m-%d')}\")\n",
|
|
461
|
+
"\n",
|
|
462
|
+
"if cutoff_analysis:\n",
|
|
463
|
+
" split = cutoff_analysis.get_split_at_date(selected_cutoff)\n",
|
|
464
|
+
" console.metric(\"Train/Score Split\", f\"{split['train_pct']:.0f}% / {split['score_pct']:.0f}%\")\n",
|
|
465
|
+
" console.metric(\"Train Records\", f\"{split['train_count']:,}\")\n",
|
|
466
|
+
" console.metric(\"Score Records\", f\"{split['score_count']:,}\")\n",
|
|
467
|
+
"console.end_section()\n",
|
|
468
|
+
"\n",
|
|
469
|
+
"# Display chart with selected cutoff\n",
|
|
470
|
+
"if cutoff_analysis:\n",
|
|
471
|
+
" chart_builder = ChartBuilder()\n",
|
|
472
|
+
" display_figure(chart_builder.cutoff_selection_chart(\n",
|
|
473
|
+
" cutoff_analysis,\n",
|
|
474
|
+
" suggested_cutoff=selected_cutoff,\n",
|
|
475
|
+
" current_cutoff=registry_cutoff\n",
|
|
476
|
+
" ))"
|
|
477
|
+
]
|
|
478
|
+
},
|
|
479
|
+
{
|
|
480
|
+
"cell_type": "code",
|
|
481
|
+
"execution_count": null,
|
|
482
|
+
"id": "cell-6",
|
|
483
|
+
"metadata": {
|
|
484
|
+
"execution": {
|
|
485
|
+
"iopub.execute_input": "2026-02-02T13:00:38.555852Z",
|
|
486
|
+
"iopub.status.busy": "2026-02-02T13:00:38.555746Z",
|
|
487
|
+
"iopub.status.idle": "2026-02-02T13:00:39.496304Z",
|
|
488
|
+
"shell.execute_reply": "2026-02-02T13:00:39.495731Z"
|
|
489
|
+
},
|
|
490
|
+
"papermill": {
|
|
491
|
+
"duration": 0.945295,
|
|
492
|
+
"end_time": "2026-02-02T13:00:39.496802",
|
|
493
|
+
"exception": false,
|
|
494
|
+
"start_time": "2026-02-02T13:00:38.551507",
|
|
495
|
+
"status": "completed"
|
|
496
|
+
},
|
|
497
|
+
"tags": []
|
|
498
|
+
},
|
|
499
|
+
"outputs": [],
|
|
500
|
+
"source": [
|
|
501
|
+
"# pit_registry already initialized in cutoff analysis cell\n",
|
|
502
|
+
"dataset_name = Path(DATA_PATH).stem\n",
|
|
503
|
+
"\n",
|
|
504
|
+
"# Use the user's selected cutoff (not forced by registry)\n",
|
|
505
|
+
"cutoff_date = selected_cutoff\n",
|
|
506
|
+
"\n",
|
|
507
|
+
"# Warn if overriding registry\n",
|
|
508
|
+
"if registry_cutoff and registry_cutoff.date() != selected_cutoff.date():\n",
|
|
509
|
+
" console.start_section()\n",
|
|
510
|
+
" console.header(\"Registry Update\")\n",
|
|
511
|
+
" console.warning(f\"Overriding registry cutoff ({registry_cutoff.date()}) with {selected_cutoff.date()}\")\n",
|
|
512
|
+
" console.info(\"All datasets in this project should use the same cutoff date\")\n",
|
|
513
|
+
" console.end_section()\n",
|
|
514
|
+
"\n",
|
|
515
|
+
"preparer = UnifiedDataPreparer(OUTPUT_DIR, ts_config)\n",
|
|
516
|
+
"df = preparer.prepare_from_raw(raw_df, target_column=TARGET_COLUMN, entity_column=entity_column or \"entity_id\")\n",
|
|
517
|
+
"\n",
|
|
518
|
+
"# Use the same last_action_series from cutoff analysis for snapshot splitting\n",
|
|
519
|
+
"# (do NOT re-derive on prepared df, which has extra timestamp columns)\n",
|
|
520
|
+
"snapshot_df, snapshot_metadata = preparer.create_training_snapshot(\n",
|
|
521
|
+
" df, cutoff_date, timestamp_series=last_action_series\n",
|
|
522
|
+
")\n",
|
|
523
|
+
"\n",
|
|
524
|
+
"pit_registry.register_snapshot(\n",
|
|
525
|
+
" dataset_name=dataset_name,\n",
|
|
526
|
+
" snapshot_id=snapshot_metadata['snapshot_id'],\n",
|
|
527
|
+
" cutoff_date=cutoff_date,\n",
|
|
528
|
+
" source_path=DATA_PATH,\n",
|
|
529
|
+
" row_count=snapshot_metadata['row_count']\n",
|
|
530
|
+
")\n",
|
|
531
|
+
"\n",
|
|
532
|
+
"console.start_section()\n",
|
|
533
|
+
"console.header(\"Point-in-Time Snapshot Created\")\n",
|
|
534
|
+
"console.metric(\"Dataset\", dataset_name)\n",
|
|
535
|
+
"console.metric(\"Snapshot ID\", snapshot_metadata['snapshot_id'])\n",
|
|
536
|
+
"console.metric(\"Rows\", f\"{snapshot_metadata['row_count']:,}\")\n",
|
|
537
|
+
"console.metric(\"Features\", len(snapshot_metadata['feature_columns']))\n",
|
|
538
|
+
"console.metric(\"Cutoff Date\", str(cutoff_date.date()))\n",
|
|
539
|
+
"console.metric(\"Data Hash\", snapshot_metadata['data_hash'][:16] + \"...\")\n",
|
|
540
|
+
"\n",
|
|
541
|
+
"# Sanity check: snapshot size should be consistent with cutoff analysis\n",
|
|
542
|
+
"if cutoff_analysis:\n",
|
|
543
|
+
" expected_split = cutoff_analysis.get_split_at_date(cutoff_date)\n",
|
|
544
|
+
" expected_train = expected_split['train_count']\n",
|
|
545
|
+
" actual_ratio = snapshot_metadata['row_count'] / len(df) * 100\n",
|
|
546
|
+
" console.metric(\"Split Ratio\", f\"{actual_ratio:.0f}% train / {100 - actual_ratio:.0f}% score\")\n",
|
|
547
|
+
" if abs(actual_ratio - expected_split['train_pct']) > 10:\n",
|
|
548
|
+
" console.error(\n",
|
|
549
|
+
" f\"SPLIT MISMATCH: snapshot has {snapshot_metadata['row_count']:,} rows \"\n",
|
|
550
|
+
" f\"({actual_ratio:.0f}%) but analysis expected {expected_train:,} ({expected_split['train_pct']:.0f}%)\"\n",
|
|
551
|
+
" )\n",
|
|
552
|
+
"\n",
|
|
553
|
+
"if \"feature_timestamp\" in df.columns:\n",
|
|
554
|
+
" console.success(\"Temporal columns added: feature_timestamp, label_timestamp\")\n",
|
|
555
|
+
"else:\n",
|
|
556
|
+
" console.warning(\"No temporal columns added (synthetic strategy)\")\n",
|
|
557
|
+
"\n",
|
|
558
|
+
"updated_report = pit_registry.check_consistency()\n",
|
|
559
|
+
"if updated_report.is_consistent:\n",
|
|
560
|
+
" console.success(f\"All {len(pit_registry.snapshots)} datasets use cutoff: {cutoff_date.date()}\")\n",
|
|
561
|
+
"else:\n",
|
|
562
|
+
" console.error(\"INCONSISTENT CUTOFF DATES DETECTED\")\n",
|
|
563
|
+
" console.warning(f\"Out of sync: {', '.join(updated_report.inconsistent_datasets)}\")\n",
|
|
564
|
+
" console.info(\"Re-run notebook 01 for out-of-sync datasets to align cutoff dates\")\n",
|
|
565
|
+
"\n",
|
|
566
|
+
"console.end_section()\n",
|
|
567
|
+
"\n",
|
|
568
|
+
"df = snapshot_df"
|
|
569
|
+
]
|
|
570
|
+
},
|
|
571
|
+
{
|
|
572
|
+
"cell_type": "markdown",
|
|
573
|
+
"id": "cell-7",
|
|
574
|
+
"metadata": {
|
|
575
|
+
"papermill": {
|
|
576
|
+
"duration": 0.003621,
|
|
577
|
+
"end_time": "2026-02-02T13:00:39.504319",
|
|
578
|
+
"exception": false,
|
|
579
|
+
"start_time": "2026-02-02T13:00:39.500698",
|
|
580
|
+
"status": "completed"
|
|
581
|
+
},
|
|
582
|
+
"tags": []
|
|
583
|
+
},
|
|
584
|
+
"source": [
|
|
585
|
+
"## 1.3 Dataset Exploration\n",
|
|
586
|
+
"\n",
|
|
587
|
+
"Now we explore the **snapshot data** (not raw data). This ensures all visualizations and metrics reflect the actual training data with temporal integrity."
|
|
588
|
+
]
|
|
589
|
+
},
|
|
590
|
+
{
|
|
591
|
+
"cell_type": "code",
|
|
592
|
+
"execution_count": null,
|
|
593
|
+
"id": "cell-8",
|
|
594
|
+
"metadata": {
|
|
595
|
+
"execution": {
|
|
596
|
+
"iopub.execute_input": "2026-02-02T13:00:39.512718Z",
|
|
597
|
+
"iopub.status.busy": "2026-02-02T13:00:39.512483Z",
|
|
598
|
+
"iopub.status.idle": "2026-02-02T13:00:40.268036Z",
|
|
599
|
+
"shell.execute_reply": "2026-02-02T13:00:40.267613Z"
|
|
600
|
+
},
|
|
601
|
+
"papermill": {
|
|
602
|
+
"duration": 0.760707,
|
|
603
|
+
"end_time": "2026-02-02T13:00:40.268855",
|
|
604
|
+
"exception": false,
|
|
605
|
+
"start_time": "2026-02-02T13:00:39.508148",
|
|
606
|
+
"status": "completed"
|
|
607
|
+
},
|
|
608
|
+
"tags": []
|
|
609
|
+
},
|
|
610
|
+
"outputs": [],
|
|
611
|
+
"source": [
|
|
612
|
+
"# Explore the snapshot data\n",
|
|
613
|
+
"# Note: UnifiedDataPreparer renames the target column to \"target\" in the snapshot\n",
|
|
614
|
+
"# So we use \"target\" as the hint, not the original TARGET_COLUMN name\n",
|
|
615
|
+
"explorer = DataExplorer(visualize=False, save_findings=True, output_dir=str(OUTPUT_DIR))\n",
|
|
616
|
+
"findings = explorer.explore(df, target_hint=\"target\", name=dataset_name)\n",
|
|
617
|
+
"findings.source_path = DATA_PATH\n",
|
|
618
|
+
"\n",
|
|
619
|
+
"# Store snapshot info in findings\n",
|
|
620
|
+
"findings.snapshot_id = snapshot_metadata['snapshot_id']\n",
|
|
621
|
+
"findings.snapshot_path = str(OUTPUT_DIR / \"snapshots\" / f\"{snapshot_metadata['snapshot_id']}.parquet\")\n",
|
|
622
|
+
"findings.timestamp_scenario = scenario\n",
|
|
623
|
+
"findings.timestamp_strategy = ts_config.strategy.value\n",
|
|
624
|
+
"\n",
|
|
625
|
+
"# Also store the original target column name for reference\n",
|
|
626
|
+
"findings.metadata[\"original_target_column\"] = TARGET_COLUMN\n",
|
|
627
|
+
"\n",
|
|
628
|
+
"granularity = \"event\" if granularity_result.granularity == DatasetGranularity.EVENT_LEVEL else \"entity\"\n",
|
|
629
|
+
"\n",
|
|
630
|
+
"# Display dataset overview\n",
|
|
631
|
+
"chart_builder = ChartBuilder()\n",
|
|
632
|
+
"display_figure(chart_builder.dataset_at_a_glance(\n",
|
|
633
|
+
" df, findings,\n",
|
|
634
|
+
" source_path=f\"Snapshot: {snapshot_metadata['snapshot_id']}\",\n",
|
|
635
|
+
" granularity=granularity,\n",
|
|
636
|
+
" max_columns=15,\n",
|
|
637
|
+
" columns_per_row=5\n",
|
|
638
|
+
"))"
|
|
639
|
+
]
|
|
640
|
+
},
|
|
641
|
+
{
|
|
642
|
+
"cell_type": "markdown",
|
|
643
|
+
"id": "cell-9",
|
|
644
|
+
"metadata": {
|
|
645
|
+
"papermill": {
|
|
646
|
+
"duration": 0.008723,
|
|
647
|
+
"end_time": "2026-02-02T13:00:40.285925",
|
|
648
|
+
"exception": false,
|
|
649
|
+
"start_time": "2026-02-02T13:00:40.277202",
|
|
650
|
+
"status": "completed"
|
|
651
|
+
},
|
|
652
|
+
"tags": []
|
|
653
|
+
},
|
|
654
|
+
"source": [
|
|
655
|
+
"## 1.4 Column Summary Table"
|
|
656
|
+
]
|
|
657
|
+
},
|
|
658
|
+
{
|
|
659
|
+
"cell_type": "code",
|
|
660
|
+
"execution_count": null,
|
|
661
|
+
"id": "cell-10",
|
|
662
|
+
"metadata": {
|
|
663
|
+
"execution": {
|
|
664
|
+
"iopub.execute_input": "2026-02-02T13:00:40.304451Z",
|
|
665
|
+
"iopub.status.busy": "2026-02-02T13:00:40.304338Z",
"iopub.status.idle": "2026-02-02T13:00:40.309685Z",
"shell.execute_reply": "2026-02-02T13:00:40.309252Z"
},
"papermill": {
"duration": 0.015804,
"end_time": "2026-02-02T13:00:40.310226",
"exception": false,
"start_time": "2026-02-02T13:00:40.294422",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Exclude temporal metadata columns from summary\n",
"\n",
"summary_data = []\n",
"for name, col in findings.columns.items():\n",
"    if name in TEMPORAL_METADATA_COLS:\n",
"        continue\n",
"    null_pct = col.universal_metrics.get(\"null_percentage\", 0)\n",
"    distinct = col.universal_metrics.get(\"distinct_count\", \"N/A\")\n",
"    summary_data.append({\n",
"        \"Column\": name,\n",
"        \"Type\": col.inferred_type.value,\n",
"        \"Confidence\": f\"{col.confidence:.0%}\",\n",
"        \"Nulls %\": f\"{null_pct:.1f}%\",\n",
"        \"Distinct\": distinct,\n",
"        \"Evidence\": col.evidence[0] if col.evidence else \"\"\n",
"    })\n",
"\n",
"summary_df = pd.DataFrame(summary_data)\n",
"display_table(summary_df)"
]
},
{
"cell_type": "markdown",
"id": "cell-11",
"metadata": {
"papermill": {
"duration": 0.008735,
"end_time": "2026-02-02T13:00:40.327637",
"exception": false,
"start_time": "2026-02-02T13:00:40.318902",
"status": "completed"
},
"tags": []
},
"source": [
"## 1.5 Target Column Verification"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cell-12",
"metadata": {
"execution": {
"iopub.execute_input": "2026-02-02T13:00:40.345703Z",
"iopub.status.busy": "2026-02-02T13:00:40.345579Z",
"iopub.status.idle": "2026-02-02T13:00:40.349522Z",
"shell.execute_reply": "2026-02-02T13:00:40.348868Z"
},
"papermill": {
"duration": 0.014034,
"end_time": "2026-02-02T13:00:40.350273",
"exception": false,
"start_time": "2026-02-02T13:00:40.336239",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"console.start_section()\n",
"console.header(\"Target Column\")\n",
"\n",
"if findings.target_column and findings.target_column in df.columns:\n",
"    console.success(f\"Target: {findings.target_column}\")\n",
"    target_counts = df[findings.target_column].value_counts()\n",
"    for val, count in target_counts.items():\n",
"        pct = (count / len(df)) * 100\n",
"        console.metric(f\"Class {val}\", f\"{count:,} ({pct:.1f}%)\")\n",
"    if granularity_result.granularity == DatasetGranularity.EVENT_LEVEL:\n",
"        console.info(\"Note: Event-level distribution (per row, not per entity)\")\n",
"else:\n",
"    console.warning(\"No target column configured\")\n",
"    console.info(\"Set TARGET_COLUMN in the configuration cell above\")\n",
"\n",
"console.end_section()"
]
},
{
"cell_type": "markdown",
"id": "cell-13",
"metadata": {
"papermill": {
"duration": 0.009232,
"end_time": "2026-02-02T13:00:40.368298",
"exception": false,
"start_time": "2026-02-02T13:00:40.359066",
"status": "completed"
},
"tags": []
},
"source": [
"## 1.6 Dataset Structure Detection"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cell-14",
"metadata": {
"execution": {
"iopub.execute_input": "2026-02-02T13:00:40.400815Z",
"iopub.status.busy": "2026-02-02T13:00:40.400689Z",
"iopub.status.idle": "2026-02-02T13:00:40.754947Z",
"shell.execute_reply": "2026-02-02T13:00:40.754559Z"
},
"papermill": {
"duration": 0.363804,
"end_time": "2026-02-02T13:00:40.755594",
"exception": false,
"start_time": "2026-02-02T13:00:40.391790",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"ts_detector = TimeSeriesDetector()\n",
"valid_entity_col = entity_column if entity_column and entity_column in df.columns else None\n",
"if not valid_entity_col:\n",
"    for col in df.columns:\n",
"        if any(p in col.lower() for p in [\"customer\", \"user\", \"entity\", \"account\"]) and \"id\" in col.lower():\n",
"            if df[col].nunique() < len(df):\n",
"                valid_entity_col = col\n",
"                break\n",
"\n",
"detected_ts_col = \"feature_timestamp\" if \"feature_timestamp\" in df.columns else None\n",
"ts_characteristics = ts_detector.detect(df, entity_column=valid_entity_col, timestamp_column=detected_ts_col)\n",
"\n",
"console.start_section()\n",
"console.header(\"Dataset Structure\")\n",
"console.metric(\"Granularity\", granularity_result.granularity.value.upper())\n",
"if ts_characteristics.dataset_type.value != \"unknown\":\n",
"    console.metric(\"Temporal Pattern\", ts_characteristics.dataset_type.value.upper())\n",
"console.metric(\"Entity Column\", valid_entity_col or entity_column or \"N/A\")\n",
"\n",
"if granularity_result.unique_entities:\n",
"    console.metric(\"Unique Entities\", f\"{granularity_result.unique_entities:,}\")\n",
"if granularity_result.avg_events_per_entity:\n",
"    console.metric(\"Avg Events/Entity\", f\"{granularity_result.avg_events_per_entity:.1f}\")\n",
"\n",
"is_event_level = granularity_result.granularity == DatasetGranularity.EVENT_LEVEL\n",
"if is_event_level:\n",
"    console.info(\"EVENT-LEVEL DATA - Use Event Bronze Track:\")\n",
"    console.info(\" -> 01a_temporal_deep_dive.ipynb\")\n",
"    console.info(\" -> 01b_temporal_quality.ipynb\")\n",
"    console.info(\" -> 01c_temporal_patterns.ipynb\")\n",
"    console.info(\" -> 01d_event_aggregation.ipynb\")\n",
"else:\n",
"    console.info(\"ENTITY-LEVEL DATA - Use standard flow:\")\n",
"    console.info(\" -> 02_column_deep_dive.ipynb\")\n",
"    console.info(\" -> 03_quality_assessment.ipynb\")\n",
"\n",
"console.end_section()"
]
},
{
"cell_type": "markdown",
"id": "5x5ypzy4cqg",
"metadata": {
"papermill": {
"duration": 0.008912,
"end_time": "2026-02-02T13:00:40.773089",
"exception": false,
"start_time": "2026-02-02T13:00:40.764177",
"status": "completed"
},
"tags": []
},
"source": [
"## 1.7 Feature Availability Analysis\n",
"\n",
"**Detect tracking changes over time:**\n",
"- **New tracking**: Variables that only exist from a certain date (new IT system introduced)\n",
"- **Retired tracking**: Variables that stop being tracked (system decommissioned)\n",
"- **Partial window**: Variables available only during a specific period\n",
"\n",
"This affects modeling because train/test splits should not cross availability boundaries."
]
},
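The markdown cell above describes the availability categories that the next code cell computes via `analyze_feature_availability`. That helper's implementation is not included in this diff, so the following is only a minimal sketch of the same idea, assuming a pandas DataFrame with a timestamp column; the function name `classify_availability`, the `grace_days` threshold, and the labels are illustrative assumptions, not churnkit's API.

```python
# Minimal sketch (not churnkit's implementation): classify per-column
# availability windows relative to the overall data span.
import pandas as pd


def classify_availability(df: pd.DataFrame, time_col: str, grace_days: int = 7) -> pd.DataFrame:
    """Label each column as full_coverage / new_tracking / retired_tracking / partial_window."""
    ts = pd.to_datetime(df[time_col])
    start, end = ts.min(), ts.max()
    rows = []
    for col in df.columns:
        if col == time_col:
            continue
        valid_ts = ts[df[col].notna()]          # timestamps of rows where the column has a value
        if valid_ts.empty:
            rows.append({"column": col, "availability": "never_observed"})
            continue
        first, last = valid_ts.min(), valid_ts.max()
        starts_late = (first - start).days > grace_days   # appears only after tracking began
        ends_early = (end - last).days > grace_days        # disappears before the data ends
        label = ("partial_window" if starts_late and ends_early
                 else "new_tracking" if starts_late
                 else "retired_tracking" if ends_early
                 else "full_coverage")
        rows.append({
            "column": col,
            "first_valid": first.date(),
            "last_valid": last.date(),
            "coverage_pct": round(100 * df[col].notna().mean(), 1),
            "availability": label,
        })
    return pd.DataFrame(rows)


# Usage (hypothetical column name): classify_availability(df, "feature_timestamp")
```

A column whose first valid timestamp falls well after the start of the span is flagged as newly tracked, one whose last valid timestamp falls well before the end as retired, and one bounded on both sides as a partial window; anything else counts as full coverage.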
{
"cell_type": "code",
"execution_count": null,
"id": "9wdocsw3wzj",
"metadata": {
"execution": {
"iopub.execute_input": "2026-02-02T13:00:40.791006Z",
"iopub.status.busy": "2026-02-02T13:00:40.790895Z",
"iopub.status.idle": "2026-02-02T13:00:40.815884Z",
"shell.execute_reply": "2026-02-02T13:00:40.815234Z"
},
"papermill": {
"duration": 0.035005,
"end_time": "2026-02-02T13:00:40.816633",
"exception": false,
"start_time": "2026-02-02T13:00:40.781628",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Feature Availability Analysis - detect tracking changes\n",
"from customer_retention.analysis.auto_explorer.findings import FeatureAvailabilityInfo, FeatureAvailabilityMetadata\n",
"from customer_retention.stages.profiling import analyze_feature_availability\n",
"\n",
"time_col = detected_ts_col or ts_characteristics.timestamp_column\n",
"if time_col and time_col in df.columns:\n",
"    exclude_cols = list(TEMPORAL_METADATA_COLS) + ([valid_entity_col] if valid_entity_col else [])\n",
"    availability = analyze_feature_availability(df, time_col, exclude_columns=exclude_cols)\n",
" \n",
"    console.start_section()\n",
"    console.header(\"Feature Availability\")\n",
"    console.metric(\"Data Span\", f\"{availability.data_start.date()} to {availability.data_end.date()} ({availability.time_span_days} days)\")\n",
" \n",
"    if availability.new_tracking or availability.retired_tracking or availability.partial_window:\n",
"        if availability.new_tracking:\n",
"            console.warning(f\"New tracking ({len(availability.new_tracking)}): {', '.join(availability.new_tracking[:5])}\" + \n",
"                            (f\" +{len(availability.new_tracking)-5} more\" if len(availability.new_tracking) > 5 else \"\"))\n",
"        if availability.retired_tracking:\n",
"            console.warning(f\"Retired tracking ({len(availability.retired_tracking)}): {', '.join(availability.retired_tracking[:5])}\" +\n",
"                            (f\" +{len(availability.retired_tracking)-5} more\" if len(availability.retired_tracking) > 5 else \"\"))\n",
"        if availability.partial_window:\n",
"            console.warning(f\"Partial window ({len(availability.partial_window)}): {', '.join(availability.partial_window[:5])}\" +\n",
"                            (f\" +{len(availability.partial_window)-5} more\" if len(availability.partial_window) > 5 else \"\"))\n",
" \n",
"        console.subheader(\"Recommendations\")\n",
"        for rec in availability.recommendations[:5]:\n",
"            if rec[\"column\"] != \"_general_\":\n",
"                console.info(f\"{rec['column']}: {rec['reason']}\")\n",
"        general_recs = [r for r in availability.recommendations if r[\"column\"] == \"_general_\"]\n",
"        if general_recs:\n",
"            console.warning(\"Train/test split should not cross feature availability boundaries\")\n",
"    else:\n",
"        console.success(\"All features have full temporal coverage\")\n",
"    console.end_section()\n",
" \n",
"    # Store structured availability metadata for downstream use\n",
"    features_info = {\n",
"        feat.column: FeatureAvailabilityInfo(\n",
"            first_valid_date=feat.first_valid_date.isoformat()[:10] if feat.first_valid_date else None,\n",
"            last_valid_date=feat.last_valid_date.isoformat()[:10] if feat.last_valid_date else None,\n",
"            coverage_pct=feat.coverage_pct,\n",
"            availability_type=feat.availability_type,\n",
"            days_from_start=feat.days_from_start,\n",
"            days_before_end=feat.days_before_end,\n",
"        )\n",
"        for feat in availability.features\n",
"    }\n",
"    findings.feature_availability = FeatureAvailabilityMetadata(\n",
"        data_start=availability.data_start.isoformat()[:10],\n",
"        data_end=availability.data_end.isoformat()[:10],\n",
"        time_span_days=availability.time_span_days,\n",
"        new_tracking=availability.new_tracking,\n",
"        retired_tracking=availability.retired_tracking,\n",
"        partial_window=availability.partial_window,\n",
"        features=features_info,\n",
"    )\n",
"else:\n",
"    console.info(\"No timestamp column detected - skipping feature availability analysis\")"
]
},
{
"cell_type": "markdown",
"id": "cell-15",
"metadata": {
"papermill": {
"duration": 0.009128,
"end_time": "2026-02-02T13:00:40.834570",
"exception": false,
"start_time": "2026-02-02T13:00:40.825442",
"status": "completed"
},
"tags": []
},
"source": [
"## 1.8 Type Override (Optional)\n",
"\n",
"Override any incorrectly inferred column types before saving findings."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cell-16",
"metadata": {
"execution": {
"iopub.execute_input": "2026-02-02T13:00:40.853044Z",
"iopub.status.busy": "2026-02-02T13:00:40.852915Z",
"iopub.status.idle": "2026-02-02T13:00:40.856702Z",
"shell.execute_reply": "2026-02-02T13:00:40.856113Z"
},
"papermill": {
"duration": 0.014054,
"end_time": "2026-02-02T13:00:40.857405",
"exception": false,
"start_time": "2026-02-02T13:00:40.843351",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# === TYPE OVERRIDES ===\n",
"TYPE_OVERRIDES = {\n",
"    # \"column_name\": ColumnType.NEW_TYPE,\n",
"}\n",
"\n",
"console.start_section()\n",
"console.header(\"Type Override Review\")\n",
"\n",
"low_conf = [(name, col.inferred_type.value, col.confidence)\n",
"            for name, col in findings.columns.items()\n",
"            if col.confidence < 0.8 and name not in TEMPORAL_METADATA_COLS]\n",
"if low_conf:\n",
"    console.subheader(\"Low Confidence Detections\")\n",
"    for col_name, col_type, conf in sorted(low_conf, key=lambda x: x[2]):\n",
"        console.warning(f\"{col_name}: {col_type} ({conf:.0%})\")\n",
"else:\n",
"    console.success(\"All type detections have high confidence (>=80%)\")\n",
"\n",
"if TYPE_OVERRIDES:\n",
"    console.subheader(\"Applying Overrides\")\n",
"    for col_name, new_type in TYPE_OVERRIDES.items():\n",
"        if col_name in findings.columns:\n",
"            old_type = findings.columns[col_name].inferred_type.value\n",
"            findings.columns[col_name].inferred_type = new_type\n",
"            findings.columns[col_name].confidence = 1.0\n",
"            console.success(f\"{col_name}: {old_type} -> {new_type.value}\")\n",
"\n",
"console.end_section()"
]
},
{
"cell_type": "markdown",
"id": "cell-17",
"metadata": {
"papermill": {
"duration": 0.008987,
"end_time": "2026-02-02T13:00:40.874685",
"exception": false,
"start_time": "2026-02-02T13:00:40.865698",
"status": "completed"
},
"tags": []
},
"source": [
"## 1.9 Save Findings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cell-18",
"metadata": {
"execution": {
"iopub.execute_input": "2026-02-02T13:00:40.892705Z",
"iopub.status.busy": "2026-02-02T13:00:40.892582Z",
"iopub.status.idle": "2026-02-02T13:00:40.907134Z",
"shell.execute_reply": "2026-02-02T13:00:40.906616Z"
},
"papermill": {
"duration": 0.024129,
"end_time": "2026-02-02T13:00:40.907614",
"exception": false,
"start_time": "2026-02-02T13:00:40.883485",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Populate time series metadata if event-level\n",
"if is_event_level:\n",
"    # Use feature_timestamp if available (coalesced column), otherwise fall back to detected column\n",
"    snapshot_time_col = \"feature_timestamp\" if \"feature_timestamp\" in df.columns else (\n",
"        granularity_result.time_column or ts_characteristics.timestamp_column\n",
"    )\n",
"    findings.time_series_metadata = TimeSeriesMetadata(\n",
"        granularity=DatasetGranularity.EVENT_LEVEL,\n",
"        temporal_pattern=ts_characteristics.dataset_type.value,\n",
"        entity_column=entity_column,\n",
"        time_column=snapshot_time_col,\n",
"        avg_events_per_entity=granularity_result.avg_events_per_entity,\n",
"        time_span_days=int(ts_characteristics.time_span_days) if ts_characteristics.time_span_days else None,\n",
"        unique_entities=granularity_result.unique_entities,\n",
"        suggested_aggregations=[\"24h\", \"7d\", \"30d\", \"90d\", \"all_time\"]\n",
"    )\n",
"\n",
"FINDINGS_PATH = explorer.last_findings_path\n",
"findings.save(FINDINGS_PATH)\n",
"\n",
"console.start_section()\n",
"console.header(\"Findings Saved\")\n",
"console.success(f\"Findings: {FINDINGS_PATH}\")\n",
"console.success(f\"Snapshot: {findings.snapshot_path}\")\n",
"console.metric(\"Columns\", findings.column_count)\n",
"console.metric(\"Target\", findings.target_column or \"Not set\")\n",
"console.metric(\"Snapshot ID\", findings.snapshot_id)\n",
"console.metric(\"Timestamp Strategy\", findings.timestamp_strategy)\n",
"console.end_section()\n"
]
},
{
"cell_type": "markdown",
"id": "cell-19",
"metadata": {
"papermill": {
"duration": 0.008551,
"end_time": "2026-02-02T13:00:40.925353",
"exception": false,
"start_time": "2026-02-02T13:00:40.916802",
"status": "completed"
},
"tags": []
},
"source": [
"## 1.10 Summary\n",
"\n",
"**What was created:**\n",
"- Point-in-time snapshot with `feature_timestamp` and `label_timestamp`\n",
"- Exploration findings with column types and metrics\n",
"\n",
"**All downstream notebooks load the snapshot**, ensuring:\n",
"- Temporal integrity (no data leakage)\n",
"- Reproducibility (SHA256 hash verification)\n",
"- Consistency (same data across all analysis)\n",
"\n",
"**Next steps:**\n",
"- Entity-level data: `02_column_deep_dive.ipynb`\n",
"- Event-level data: `01a_temporal_deep_dive.ipynb`"
]
},
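The summary cell above cites SHA256 hash verification as the reproducibility guarantee for the saved snapshot. The snippet below is a minimal sketch of such a check, assuming the snapshot is a single file on disk and that an expected digest is recorded alongside the findings; `verify_snapshot` and `expected_hash` are illustrative names, not churnkit's API.

```python
# Minimal sketch (not churnkit's API): verify a saved snapshot against a
# recorded SHA256 digest. The path and expected_hash value are assumptions.
import hashlib
from pathlib import Path


def sha256_of_file(path: Path, chunk_size: int = 1 << 20) -> str:
    """Stream the file in chunks so large snapshots need not fit in memory."""
    digest = hashlib.sha256()
    with path.open("rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def verify_snapshot(snapshot_path: str, expected_hash: str) -> bool:
    """True when the snapshot on disk matches the digest recorded with the findings."""
    return sha256_of_file(Path(snapshot_path)) == expected_hash
```

If the comparison fails at load time, downstream notebooks know they are no longer analyzing the exact data the findings describe.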
{
"cell_type": "markdown",
"id": "4c15f6a9",
"metadata": {
"papermill": {
"duration": 0.008746,
"end_time": "2026-02-02T13:00:40.942103",
"exception": false,
"start_time": "2026-02-02T13:00:40.933357",
"status": "completed"
},
"tags": []
},
"source": [
"> **Save Reminder:** Save this notebook (Ctrl+S / Cmd+S) before running the next one.\n",
"> The next notebook will automatically export this notebook's HTML documentation from the saved file."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
},
"papermill": {
"default_parameters": {},
"duration": 5.708346,
"end_time": "2026-02-02T13:00:41.468599",
"environment_variables": {},
"exception": null,
"input_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/01_data_discovery.ipynb",
"output_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/01_data_discovery.ipynb",
"parameters": {},
"start_time": "2026-02-02T13:00:35.760253",
"version": "2.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}