churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
|
@@ -0,0 +1,647 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"id": "cell-0",
|
|
6
|
+
"metadata": {
|
|
7
|
+
"papermill": {
|
|
8
|
+
"duration": 0.002321,
|
|
9
|
+
"end_time": "2026-02-02T13:00:34.189136",
|
|
10
|
+
"exception": false,
|
|
11
|
+
"start_time": "2026-02-02T13:00:34.186815",
|
|
12
|
+
"status": "completed"
|
|
13
|
+
},
|
|
14
|
+
"tags": []
|
|
15
|
+
},
|
|
16
|
+
"source": [
|
|
17
|
+
"# Start Here: Prerequisites\n",
|
|
18
|
+
"\n",
|
|
19
|
+
"> **No sample data required!** This framework works directly with your own CSV, Parquet, or Delta files. The datasets below are internal examples for learning - skip to **01_data_discovery.ipynb** if you have your own data.\n",
|
|
20
|
+
"\n",
|
|
21
|
+
"**Purpose:** Set up your environment and optionally download sample datasets for learning.\n",
|
|
22
|
+
"\n",
|
|
23
|
+
"**What you'll do:**\n",
|
|
24
|
+
"- Verify your Python environment\n",
|
|
25
|
+
"- (Optional) Set up Kaggle API credentials\n",
|
|
26
|
+
"- (Optional) Download sample churn datasets\n",
|
|
27
|
+
"\n",
|
|
28
|
+
"---"
|
|
29
|
+
]
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"cell_type": "markdown",
|
|
33
|
+
"id": "cell-1",
|
|
34
|
+
"metadata": {
|
|
35
|
+
"papermill": {
|
|
36
|
+
"duration": 0.001561,
|
|
37
|
+
"end_time": "2026-02-02T13:00:34.192541",
|
|
38
|
+
"exception": false,
|
|
39
|
+
"start_time": "2026-02-02T13:00:34.190980",
|
|
40
|
+
"status": "completed"
|
|
41
|
+
},
|
|
42
|
+
"tags": []
|
|
43
|
+
},
|
|
44
|
+
"source": [
|
|
45
|
+
"## 0.1 Verify Environment\n",
|
|
46
|
+
"\n",
|
|
47
|
+
"First, let's make sure the customer_retention package is installed."
|
|
48
|
+
]
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
"cell_type": "code",
|
|
52
|
+
"execution_count": null,
|
|
53
|
+
"id": "cell-2",
|
|
54
|
+
"metadata": {
|
|
55
|
+
"execution": {
|
|
56
|
+
"iopub.execute_input": "2026-02-02T13:00:34.196468Z",
|
|
57
|
+
"iopub.status.busy": "2026-02-02T13:00:34.196335Z",
|
|
58
|
+
"iopub.status.idle": "2026-02-02T13:00:35.320889Z",
|
|
59
|
+
"shell.execute_reply": "2026-02-02T13:00:35.320316Z"
|
|
60
|
+
},
|
|
61
|
+
"papermill": {
|
|
62
|
+
"duration": 1.127344,
|
|
63
|
+
"end_time": "2026-02-02T13:00:35.321623",
|
|
64
|
+
"exception": false,
|
|
65
|
+
"start_time": "2026-02-02T13:00:34.194279",
|
|
66
|
+
"status": "completed"
|
|
67
|
+
},
|
|
68
|
+
"tags": []
|
|
69
|
+
},
|
|
70
|
+
"outputs": [],
|
|
71
|
+
"source": [
|
|
72
|
+
"from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
|
|
73
|
+
"track_and_export_previous(\"00_start_here.ipynb\")\n",
|
|
74
|
+
"\n",
|
|
75
|
+
"try:\n",
|
|
76
|
+
" import customer_retention\n",
|
|
77
|
+
" from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure\n",
|
|
78
|
+
" print(f\"customer_retention is installed\")\n",
|
|
79
|
+
"except ImportError:\n",
|
|
80
|
+
" print(\"customer_retention not found. Install with:\")\n",
|
|
81
|
+
" print(\" uv sync\")\n",
|
|
82
|
+
" print(\" # or: pip install -e .\")\n",
|
|
83
|
+
"from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS"
|
|
84
|
+
]
|
|
85
|
+
},
|
|
86
|
+
{
|
|
87
|
+
"cell_type": "markdown",
|
|
88
|
+
"id": "cell-3",
|
|
89
|
+
"metadata": {
|
|
90
|
+
"papermill": {
|
|
91
|
+
"duration": 0.0013,
|
|
92
|
+
"end_time": "2026-02-02T13:00:35.324448",
|
|
93
|
+
"exception": false,
|
|
94
|
+
"start_time": "2026-02-02T13:00:35.323148",
|
|
95
|
+
"status": "completed"
|
|
96
|
+
},
|
|
97
|
+
"tags": []
|
|
98
|
+
},
|
|
99
|
+
"source": [
|
|
100
|
+
"## 0.2 Available Datasets\n",
|
|
101
|
+
"\n",
|
|
102
|
+
"This framework ships with several sample datasets for testing and learning, and can optionally download others from Kaggle. **You do not need any of these to use the framework with your own data.**\n",
|
|
103
|
+
"\n",
|
|
104
|
+
"### Entity-Level Datasets (one row per customer)\n",
|
|
105
|
+
"Use these with the standard exploration flow (notebooks 02, 03, 04).\n",
|
|
106
|
+
"\n",
|
|
107
|
+
"| Dataset | Status | Description |\n",
|
|
108
|
+
"|---------|--------|-------------|\n",
|
|
109
|
+
"| `customer_retention_retail.csv` | Included | Retail customer retention (~31K rows) |\n",
|
|
110
|
+
"| `bank_customer_churn.csv` | Download | Bank customer churn (~10K rows) |\n",
|
|
111
|
+
"| `netflix_customer_churn.csv` | Download | Netflix subscription churn (~10K rows) |\n",
|
|
112
|
+
"\n",
|
|
113
|
+
"### Event-Level Datasets (multiple rows per customer)\n",
|
|
114
|
+
"Use these with the Event Bronze Track (notebooks 01a, 01b, 01c, 01d).\n",
|
|
115
|
+
"\n",
|
|
116
|
+
"| Dataset | Status | Description |\n",
|
|
117
|
+
"|---------|--------|-------------|\n",
|
|
118
|
+
"| `customer_transactions.csv` | Included | Transaction events (~5K rows) |\n",
|
|
119
|
+
"| `customer_emails.csv` | Included | Email engagement events (large) |"
|
|
120
|
+
]
|
|
121
|
+
},
|
|
122
|
+
{
|
|
123
|
+
"cell_type": "code",
|
|
124
|
+
"execution_count": null,
|
|
125
|
+
"id": "cell-4",
|
|
126
|
+
"metadata": {
|
|
127
|
+
"execution": {
|
|
128
|
+
"iopub.execute_input": "2026-02-02T13:00:35.327665Z",
|
|
129
|
+
"iopub.status.busy": "2026-02-02T13:00:35.327501Z",
|
|
130
|
+
"iopub.status.idle": "2026-02-02T13:00:35.331134Z",
|
|
131
|
+
"shell.execute_reply": "2026-02-02T13:00:35.330708Z"
|
|
132
|
+
},
|
|
133
|
+
"papermill": {
|
|
134
|
+
"duration": 0.005895,
|
|
135
|
+
"end_time": "2026-02-02T13:00:35.331579",
|
|
136
|
+
"exception": false,
|
|
137
|
+
"start_time": "2026-02-02T13:00:35.325684",
|
|
138
|
+
"status": "completed"
|
|
139
|
+
},
|
|
140
|
+
"tags": []
|
|
141
|
+
},
|
|
142
|
+
"outputs": [],
|
|
143
|
+
"source": [
|
|
144
|
+
"from pathlib import Path\n",
|
|
145
|
+
"\n",
|
|
146
|
+
"FIXTURES_DIR = Path(\"../tests/fixtures\")\n",
|
|
147
|
+
"\n",
|
|
148
|
+
"# Entity-level datasets\n",
|
|
149
|
+
"entity_datasets = {\n",
|
|
150
|
+
" \"customer_retention_retail.csv\": \"Included\",\n",
|
|
151
|
+
" \"bank_customer_churn.csv\": \"Download from Kaggle\",\n",
|
|
152
|
+
" \"netflix_customer_churn.csv\": \"Download from Kaggle\",\n",
|
|
153
|
+
"}\n",
|
|
154
|
+
"\n",
|
|
155
|
+
"# Event-level datasets (internal)\n",
|
|
156
|
+
"event_datasets = {\n",
|
|
157
|
+
" \"customer_transactions.csv\": \"Included\",\n",
|
|
158
|
+
" \"customer_emails.csv\": \"Included\",\n",
|
|
159
|
+
"}\n",
|
|
160
|
+
"\n",
|
|
161
|
+
"print(\"Entity-Level Datasets:\")\n",
|
|
162
|
+
"print(\"-\" * 50)\n",
|
|
163
|
+
"for name, source in entity_datasets.items():\n",
|
|
164
|
+
" path = FIXTURES_DIR / name\n",
|
|
165
|
+
" if path.exists():\n",
|
|
166
|
+
" size_mb = path.stat().st_size / (1024 * 1024)\n",
|
|
167
|
+
" print(f\" [x] {name} ({size_mb:.1f} MB)\")\n",
|
|
168
|
+
" else:\n",
|
|
169
|
+
" print(f\" [ ] {name} - {source}\")\n",
|
|
170
|
+
"\n",
|
|
171
|
+
"print(\"\\nEvent-Level Datasets:\")\n",
|
|
172
|
+
"print(\"-\" * 50)\n",
|
|
173
|
+
"for name, source in event_datasets.items():\n",
|
|
174
|
+
" path = FIXTURES_DIR / name\n",
|
|
175
|
+
" if path.exists():\n",
|
|
176
|
+
" size_mb = path.stat().st_size / (1024 * 1024)\n",
|
|
177
|
+
" print(f\" [x] {name} ({size_mb:.1f} MB)\")\n",
|
|
178
|
+
" else:\n",
|
|
179
|
+
" print(f\" [ ] {name} - {source}\")"
|
|
180
|
+
]
|
|
181
|
+
},
|
|
182
|
+
{
|
|
183
|
+
"cell_type": "markdown",
|
|
184
|
+
"id": "cell-5",
|
|
185
|
+
"metadata": {
|
|
186
|
+
"papermill": {
|
|
187
|
+
"duration": 0.001471,
|
|
188
|
+
"end_time": "2026-02-02T13:00:35.334367",
|
|
189
|
+
"exception": false,
|
|
190
|
+
"start_time": "2026-02-02T13:00:35.332896",
|
|
191
|
+
"status": "completed"
|
|
192
|
+
},
|
|
193
|
+
"tags": []
|
|
194
|
+
},
|
|
195
|
+
"source": [
|
|
196
|
+
"## 0.3 Kaggle API Setup\n",
|
|
197
|
+
"\n",
|
|
198
|
+
"To download datasets from Kaggle, you need to set up API credentials:\n",
|
|
199
|
+
"\n",
|
|
200
|
+
"1. Create a Kaggle account at https://www.kaggle.com\n",
|
|
201
|
+
"2. Go to https://www.kaggle.com/settings, scroll to the **API** section, and click **Create New Token**\n",
|
|
202
|
+
"3. This downloads `kaggle.json` - move it to `~/.kaggle/kaggle.json`\n",
|
|
203
|
+
"4. Set permissions: `chmod 600 ~/.kaggle/kaggle.json`"
|
|
204
|
+
]
|
|
205
|
+
},
|
|
206
|
+
{
|
|
207
|
+
"cell_type": "code",
|
|
208
|
+
"execution_count": null,
|
|
209
|
+
"id": "cell-6",
|
|
210
|
+
"metadata": {
|
|
211
|
+
"execution": {
|
|
212
|
+
"iopub.execute_input": "2026-02-02T13:00:35.337782Z",
|
|
213
|
+
"iopub.status.busy": "2026-02-02T13:00:35.337684Z",
|
|
214
|
+
"iopub.status.idle": "2026-02-02T13:00:35.340233Z",
|
|
215
|
+
"shell.execute_reply": "2026-02-02T13:00:35.339622Z"
|
|
216
|
+
},
|
|
217
|
+
"papermill": {
|
|
218
|
+
"duration": 0.005057,
|
|
219
|
+
"end_time": "2026-02-02T13:00:35.340826",
|
|
220
|
+
"exception": false,
|
|
221
|
+
"start_time": "2026-02-02T13:00:35.335769",
|
|
222
|
+
"status": "completed"
|
|
223
|
+
},
|
|
224
|
+
"tags": []
|
|
225
|
+
},
|
|
226
|
+
"outputs": [],
|
|
227
|
+
"source": [
|
|
228
|
+
"# Check if Kaggle credentials exist\n",
|
|
229
|
+
"kaggle_config = Path.home() / \".kaggle\" / \"kaggle.json\"\n",
|
|
230
|
+
"\n",
|
|
231
|
+
"if kaggle_config.exists():\n",
|
|
232
|
+
" print(f\"Kaggle credentials found at {kaggle_config}\")\n",
|
|
233
|
+
"else:\n",
|
|
234
|
+
" print(\"Kaggle credentials not found.\")\n",
|
|
235
|
+
" print(\"\\nTo set up:\")\n",
|
|
236
|
+
" print(\"1. Go to https://www.kaggle.com/settings\")\n",
|
|
237
|
+
" print(\"2. Scroll to 'API' section and click 'Create New Token'\")\n",
|
|
238
|
+
" print(f\"3. Move downloaded file to {kaggle_config}\")\n",
|
|
239
|
+
" print(f\"4. Run: chmod 600 {kaggle_config}\")"
|
|
240
|
+
]
|
|
241
|
+
},
|
|
242
|
+
{
|
|
243
|
+
"cell_type": "markdown",
|
|
244
|
+
"id": "cell-7",
|
|
245
|
+
"metadata": {
|
|
246
|
+
"papermill": {
|
|
247
|
+
"duration": 0.001181,
|
|
248
|
+
"end_time": "2026-02-02T13:00:35.343523",
|
|
249
|
+
"exception": false,
|
|
250
|
+
"start_time": "2026-02-02T13:00:35.342342",
|
|
251
|
+
"status": "completed"
|
|
252
|
+
},
|
|
253
|
+
"tags": []
|
|
254
|
+
},
|
|
255
|
+
"source": [
|
|
256
|
+
"## 0.4 Download Kaggle Datasets\n",
|
|
257
|
+
"\n",
|
|
258
|
+
"Run the cells below to download each dataset. You only need to do this once.\n",
|
|
259
|
+
"\n",
|
|
260
|
+
"### Bank Customer Churn Dataset\n",
|
|
261
|
+
"Source: https://www.kaggle.com/datasets/gauravtopre/bank-customer-churn-dataset"
|
|
262
|
+
]
|
|
263
|
+
},
|
|
264
|
+
{
|
|
265
|
+
"cell_type": "code",
|
|
266
|
+
"execution_count": null,
|
|
267
|
+
"id": "cell-8",
|
|
268
|
+
"metadata": {
|
|
269
|
+
"execution": {
|
|
270
|
+
"iopub.execute_input": "2026-02-02T13:00:35.346825Z",
|
|
271
|
+
"iopub.status.busy": "2026-02-02T13:00:35.346711Z",
|
|
272
|
+
"iopub.status.idle": "2026-02-02T13:00:35.349664Z",
|
|
273
|
+
"shell.execute_reply": "2026-02-02T13:00:35.349348Z"
|
|
274
|
+
},
|
|
275
|
+
"papermill": {
|
|
276
|
+
"duration": 0.005532,
|
|
277
|
+
"end_time": "2026-02-02T13:00:35.350238",
|
|
278
|
+
"exception": false,
|
|
279
|
+
"start_time": "2026-02-02T13:00:35.344706",
|
|
280
|
+
"status": "completed"
|
|
281
|
+
},
|
|
282
|
+
"tags": []
|
|
283
|
+
},
|
|
284
|
+
"outputs": [],
|
|
285
|
+
"source": [
|
|
286
|
+
"# Download Bank Customer Churn dataset\n",
|
|
287
|
+
"import subprocess\n",
|
|
288
|
+
"import shutil\n",
|
|
289
|
+
"\n",
|
|
290
|
+
"FIXTURES_DIR.mkdir(parents=True, exist_ok=True)\n",
|
|
291
|
+
"bank_churn_path = FIXTURES_DIR / \"bank_customer_churn.csv\"\n",
|
|
292
|
+
"\n",
|
|
293
|
+
"if bank_churn_path.exists():\n",
|
|
294
|
+
" print(f\"Already exists: {bank_churn_path}\")\n",
|
|
295
|
+
"else:\n",
|
|
296
|
+
" print(\"Downloading Bank Customer Churn dataset...\")\n",
|
|
297
|
+
" try:\n",
|
|
298
|
+
" subprocess.run([\n",
|
|
299
|
+
" \"kaggle\", \"datasets\", \"download\", \"-d\", \"gauravtopre/bank-customer-churn-dataset\",\n",
|
|
300
|
+
" \"-p\", str(FIXTURES_DIR), \"--unzip\"\n",
|
|
301
|
+
" ], check=True)\n",
|
|
302
|
+
" # Rename to consistent name\n",
|
|
303
|
+
" downloaded = FIXTURES_DIR / \"Bank_Churn.csv\"\n",
|
|
304
|
+
" if downloaded.exists():\n",
|
|
305
|
+
" shutil.move(downloaded, bank_churn_path)\n",
|
|
306
|
+
" print(f\"Downloaded to: {bank_churn_path}\")\n",
|
|
307
|
+
" except FileNotFoundError:\n",
|
|
308
|
+
" print(\"Error: kaggle CLI not found. Install with: pip install kaggle\")\n",
|
|
309
|
+
" except subprocess.CalledProcessError as e:\n",
|
|
310
|
+
" print(f\"Error downloading: {e}\")"
|
|
311
|
+
]
|
|
312
|
+
},
|
|
313
|
+
{
|
|
314
|
+
"cell_type": "markdown",
|
|
315
|
+
"id": "cell-9",
|
|
316
|
+
"metadata": {
|
|
317
|
+
"papermill": {
|
|
318
|
+
"duration": 0.001147,
|
|
319
|
+
"end_time": "2026-02-02T13:00:35.352728",
|
|
320
|
+
"exception": false,
|
|
321
|
+
"start_time": "2026-02-02T13:00:35.351581",
|
|
322
|
+
"status": "completed"
|
|
323
|
+
},
|
|
324
|
+
"tags": []
|
|
325
|
+
},
|
|
326
|
+
"source": [
|
|
327
|
+
"### Netflix Customer Churn Dataset\n",
|
|
328
|
+
"Source: https://www.kaggle.com/datasets/vasifasad/netflix-customer-churn-prediction"
|
|
329
|
+
]
|
|
330
|
+
},
|
|
331
|
+
{
|
|
332
|
+
"cell_type": "code",
|
|
333
|
+
"execution_count": null,
|
|
334
|
+
"id": "cell-10",
|
|
335
|
+
"metadata": {
|
|
336
|
+
"execution": {
|
|
337
|
+
"iopub.execute_input": "2026-02-02T13:00:35.355581Z",
|
|
338
|
+
"iopub.status.busy": "2026-02-02T13:00:35.355499Z",
|
|
339
|
+
"iopub.status.idle": "2026-02-02T13:00:35.357855Z",
|
|
340
|
+
"shell.execute_reply": "2026-02-02T13:00:35.357514Z"
|
|
341
|
+
},
|
|
342
|
+
"papermill": {
|
|
343
|
+
"duration": 0.004551,
|
|
344
|
+
"end_time": "2026-02-02T13:00:35.358370",
|
|
345
|
+
"exception": false,
|
|
346
|
+
"start_time": "2026-02-02T13:00:35.353819",
|
|
347
|
+
"status": "completed"
|
|
348
|
+
},
|
|
349
|
+
"tags": []
|
|
350
|
+
},
|
|
351
|
+
"outputs": [],
|
|
352
|
+
"source": [
|
|
353
|
+
"# Download Netflix Customer Churn dataset\n",
|
|
354
|
+
"netflix_churn_path = FIXTURES_DIR / \"netflix_customer_churn.csv\"\n",
|
|
355
|
+
"\n",
|
|
356
|
+
"if netflix_churn_path.exists():\n",
|
|
357
|
+
" print(f\"Already exists: {netflix_churn_path}\")\n",
|
|
358
|
+
"else:\n",
|
|
359
|
+
" print(\"Downloading Netflix Customer Churn dataset...\")\n",
|
|
360
|
+
" try:\n",
|
|
361
|
+
" subprocess.run([\n",
|
|
362
|
+
" \"kaggle\", \"datasets\", \"download\", \"-d\", \"vasifasad/netflix-customer-churn-prediction\",\n",
|
|
363
|
+
" \"-p\", str(FIXTURES_DIR), \"--unzip\"\n",
|
|
364
|
+
" ], check=True)\n",
|
|
365
|
+
" print(f\"Downloaded to: {netflix_churn_path}\")\n",
|
|
366
|
+
" except FileNotFoundError:\n",
|
|
367
|
+
" print(\"Error: kaggle CLI not found. Install with: pip install kaggle\")\n",
|
|
368
|
+
" except subprocess.CalledProcessError as e:\n",
|
|
369
|
+
" print(f\"Error downloading: {e}\")"
|
|
370
|
+
]
|
|
371
|
+
},
|
|
372
|
+
{
|
|
373
|
+
"cell_type": "markdown",
|
|
374
|
+
"id": "cell-11",
|
|
375
|
+
"metadata": {
|
|
376
|
+
"papermill": {
|
|
377
|
+
"duration": 0.001208,
|
|
378
|
+
"end_time": "2026-02-02T13:00:35.360994",
|
|
379
|
+
"exception": false,
|
|
380
|
+
"start_time": "2026-02-02T13:00:35.359786",
|
|
381
|
+
"status": "completed"
|
|
382
|
+
},
|
|
383
|
+
"tags": []
|
|
384
|
+
},
|
|
385
|
+
"source": [
|
|
386
|
+
"## 0.5 Verify Downloads"
|
|
387
|
+
]
|
|
388
|
+
},
|
|
389
|
+
{
|
|
390
|
+
"cell_type": "code",
|
|
391
|
+
"execution_count": null,
|
|
392
|
+
"id": "cell-12",
|
|
393
|
+
"metadata": {
|
|
394
|
+
"execution": {
|
|
395
|
+
"iopub.execute_input": "2026-02-02T13:00:35.363924Z",
|
|
396
|
+
"iopub.status.busy": "2026-02-02T13:00:35.363841Z",
|
|
397
|
+
"iopub.status.idle": "2026-02-02T13:00:35.529859Z",
|
|
398
|
+
"shell.execute_reply": "2026-02-02T13:00:35.529268Z"
|
|
399
|
+
},
|
|
400
|
+
"papermill": {
|
|
401
|
+
"duration": 0.168363,
|
|
402
|
+
"end_time": "2026-02-02T13:00:35.530489",
|
|
403
|
+
"exception": false,
|
|
404
|
+
"start_time": "2026-02-02T13:00:35.362126",
|
|
405
|
+
"status": "completed"
|
|
406
|
+
},
|
|
407
|
+
"tags": []
|
|
408
|
+
},
|
|
409
|
+
"outputs": [],
|
|
410
|
+
"source": [
|
|
411
|
+
"import pandas as pd\n",
|
|
412
|
+
"\n",
|
|
413
|
+
"all_datasets = {**entity_datasets, **event_datasets}\n",
|
|
414
|
+
"\n",
|
|
415
|
+
"print(\"Dataset Summary:\")\n",
|
|
416
|
+
"print(\"=\" * 60)\n",
|
|
417
|
+
"\n",
|
|
418
|
+
"for name in all_datasets.keys():\n",
|
|
419
|
+
" path = FIXTURES_DIR / name\n",
|
|
420
|
+
" if path.exists():\n",
|
|
421
|
+
" df = pd.read_csv(path)\n",
|
|
422
|
+
" print(f\"\\n{name}:\")\n",
|
|
423
|
+
" print(f\" Rows: {len(df):,}\")\n",
|
|
424
|
+
" print(f\" Columns: {len(df.columns)}\")\n",
|
|
425
|
+
" print(f\" Columns: {', '.join(df.columns[:5])}{'...' if len(df.columns) > 5 else ''}\")\n",
|
|
426
|
+
" else:\n",
|
|
427
|
+
" print(f\"\\n{name}: Not downloaded\")"
|
|
428
|
+
]
|
|
429
|
+
},
|
|
430
|
+
{
|
|
431
|
+
"cell_type": "markdown",
|
|
432
|
+
"id": "597wey6ay6f",
|
|
433
|
+
"metadata": {
|
|
434
|
+
"papermill": {
|
|
435
|
+
"duration": 0.001513,
|
|
436
|
+
"end_time": "2026-02-02T13:00:35.534172",
|
|
437
|
+
"exception": false,
|
|
438
|
+
"start_time": "2026-02-02T13:00:35.532659",
|
|
439
|
+
"status": "completed"
|
|
440
|
+
},
|
|
441
|
+
"tags": []
|
|
442
|
+
},
|
|
443
|
+
"source": [
|
|
444
|
+
"---\n",
|
|
445
|
+
"\n",
|
|
446
|
+
"## Temporal Framework Overview\n",
|
|
447
|
+
"\n",
|
|
448
|
+
"This framework includes a **leakage-safe temporal infrastructure** for preventing data leakage in ML pipelines:\n",
|
|
449
|
+
"\n",
|
|
450
|
+
"- **Timestamp Management**: Automatic detection and handling of `feature_timestamp` and `label_timestamp`\n",
|
|
451
|
+
"- **Versioned Snapshots**: Point-in-time training snapshots with integrity hashing\n",
|
|
452
|
+
"- **Scenario Detection**: Automatic detection of production vs Kaggle-style datasets\n",
|
|
453
|
+
"- **Leakage Detection**: Multi-probe validation (correlation, separation, temporal logic)\n",
|
|
454
|
+
"\n",
|
|
455
|
+
"The temporal framework ensures that:\n",
|
|
456
|
+
"1. Features are only computed using data available at prediction time\n",
|
|
457
|
+
"2. Training data is versioned and reproducible\n",
|
|
458
|
+
"3. Temporal leakage is detected before model training\n",
|
|
459
|
+
"\n",
|
|
460
|
+
"---\n",
|
|
461
|
+
"\n",
|
|
462
|
+
"## 0.6 Using the Temporal Framework\n",
|
|
463
|
+
"\n",
|
|
464
|
+
"### Loading Data with Snapshot Manager\n",
|
|
465
|
+
"\n",
|
|
466
|
+
"For production use, load data through the snapshot system to ensure reproducibility:\n",
|
|
467
|
+
"\n",
|
|
468
|
+
"```python\n",
|
|
469
|
+
"from pathlib import Path\n",
|
|
470
|
+
"from customer_retention.stages.temporal import SnapshotManager, UnifiedDataPreparer, ScenarioDetector\n",
|
|
471
|
+
"\n",
|
|
472
|
+
"output_path = Path(\"../experiments/findings\")\n",
|
|
473
|
+
"snapshot_manager = SnapshotManager(output_path)\n",
|
|
474
|
+
"\n",
|
|
475
|
+
"snapshots = snapshot_manager.list_snapshots()\n",
|
|
476
|
+
"if snapshots:\n",
|
|
477
|
+
" latest = snapshot_manager.get_latest_snapshot()\n",
|
|
478
|
+
" df, metadata = snapshot_manager.load_snapshot(latest)\n",
|
|
479
|
+
" print(f\"Loaded {latest}: {df.shape}, created {metadata['created_at']}\")\n",
|
|
480
|
+
"```\n",
|
|
481
|
+
"\n",
|
|
482
|
+
"### Auto-Detecting Dataset Scenario\n",
|
|
483
|
+
"\n",
|
|
484
|
+
"The framework automatically detects whether your data is production-style (with explicit timestamp columns) or Kaggle-style (without explicit timestamps):\n",
|
|
485
|
+
"\n",
|
|
486
|
+
"```python\n",
|
|
487
|
+
"from customer_retention.stages.temporal import ScenarioDetector\n",
|
|
488
|
+
"\n",
|
|
489
|
+
"detector = ScenarioDetector()\n",
|
|
490
|
+
"scenario, config, discovery_result = detector.detect(df, target_column=\"churned\")\n",
|
|
491
|
+
"\n",
|
|
492
|
+
"print(f\"Scenario: {scenario}\")\n",
|
|
493
|
+
"print(f\"Feature timestamp: {config.feature_timestamp_column}\")\n",
|
|
494
|
+
"print(f\"Label timestamp: {config.label_timestamp_column}\")\n",
|
|
495
|
+
"print(f\"Strategy: {config.strategy.value}\")\n",
|
|
496
|
+
"```\n",
|
|
497
|
+
"\n",
|
|
498
|
+
"### Manual Override (When Auto-Detection Fails)\n",
|
|
499
|
+
"\n",
|
|
500
|
+
"If auto-detection picks the wrong columns or an unsuitable strategy, bypass it entirely by creating a `TimestampConfig` directly:\n",
|
|
501
|
+
"\n",
|
|
502
|
+
"```python\n",
|
|
503
|
+
"from customer_retention.stages.temporal import TimestampManager, TimestampConfig, TimestampStrategy\n",
|
|
504
|
+
"\n",
|
|
505
|
+
"config = TimestampConfig(\n",
|
|
506
|
+
" strategy=TimestampStrategy.PRODUCTION,\n",
|
|
507
|
+
" feature_timestamp_column=\"my_observation_date\",\n",
|
|
508
|
+
" label_timestamp_column=\"my_outcome_date\",\n",
|
|
509
|
+
" observation_window_days=90,\n",
|
|
510
|
+
")\n",
|
|
511
|
+
"manager = TimestampManager(config)\n",
|
|
512
|
+
"df_with_timestamps = manager.ensure_timestamps(df)\n",
|
|
513
|
+
"```\n",
|
|
514
|
+
"\n",
|
|
515
|
+
"**Available strategies:**\n",
|
|
516
|
+
"\n",
|
|
517
|
+
"| Strategy | When to Use |\n",
|
|
518
|
+
"|----------|-------------|\n",
|
|
519
|
+
"| `PRODUCTION` | Data has explicit timestamp columns |\n",
|
|
520
|
+
"| `DERIVED` | Timestamps can be computed from other columns (e.g., tenure) |\n",
|
|
521
|
+
"| `SYNTHETIC_FIXED` | No temporal info - use fixed date for all rows |\n",
|
|
522
|
+
"| `SYNTHETIC_RANDOM` | No temporal info - generate random dates within range |\n",
|
|
523
|
+
"| `SYNTHETIC_INDEX` | No temporal info - generate dates based on row order |\n",
|
|
524
|
+
"\n",
|
|
525
|
+
"**Force synthetic timestamps (Kaggle-style data):**\n",
|
|
526
|
+
"\n",
|
|
527
|
+
"```python\n",
|
|
528
|
+
"config = TimestampConfig(\n",
|
|
529
|
+
" strategy=TimestampStrategy.SYNTHETIC_FIXED,\n",
|
|
530
|
+
" synthetic_base_date=\"2024-01-01\",\n",
|
|
531
|
+
" observation_window_days=90,\n",
|
|
532
|
+
")\n",
|
|
533
|
+
"```\n",
|
|
534
|
+
"\n",
|
|
535
|
+
"**Derive timestamps from tenure column:**\n",
|
|
536
|
+
"\n",
|
|
537
|
+
"```python\n",
|
|
538
|
+
"config = TimestampConfig(\n",
|
|
539
|
+
" strategy=TimestampStrategy.DERIVED,\n",
|
|
540
|
+
" derivation_config={\n",
|
|
541
|
+
" \"feature_derivation\": {\n",
|
|
542
|
+
" \"formula\": \"reference_date - tenure_months\",\n",
|
|
543
|
+
" \"sources\": [\"tenure_months\"],\n",
|
|
544
|
+
" }\n",
|
|
545
|
+
" },\n",
|
|
546
|
+
" observation_window_days=90,\n",
|
|
547
|
+
")\n",
|
|
548
|
+
"```\n",
|
|
549
|
+
"\n",
|
|
550
|
+
"### Creating a Training Snapshot from Raw Data\n",
|
|
551
|
+
"\n",
|
|
552
|
+
"```python\n",
|
|
553
|
+
"from customer_retention.stages.temporal import UnifiedDataPreparer\n",
|
|
554
|
+
"from customer_retention.core.config import TemporalConfig\n",
|
|
555
|
+
"\n",
|
|
556
|
+
"config = TemporalConfig(\n",
|
|
557
|
+
" feature_timestamp_column=\"feature_timestamp\",\n",
|
|
558
|
+
" label_timestamp_column=\"label_timestamp\",\n",
|
|
559
|
+
")\n",
|
|
560
|
+
"\n",
|
|
561
|
+
"preparer = UnifiedDataPreparer(output_path=Path(\"../experiments/findings\"), timestamp_config=config)\n",
|
|
562
|
+
"\n",
|
|
563
|
+
"unified_df = preparer.prepare_from_raw(\n",
|
|
564
|
+
" df=raw_df, target_column=\"churned\", entity_column=\"customer_id\"\n",
|
|
565
|
+
")\n",
|
|
566
|
+
"\n",
|
|
567
|
+
"snapshot_df, metadata = preparer.create_training_snapshot(df=unified_df, snapshot_name=\"training\")\n",
|
|
568
|
+
"print(f\"Created snapshot: {metadata['snapshot_id']}\")\n",
|
|
569
|
+
"```\n",
|
|
570
|
+
"\n",
|
|
571
|
+
"---\n",
|
|
572
|
+
"\n",
|
|
573
|
+
"## Next Steps\n",
|
|
574
|
+
"\n",
|
|
575
|
+
"You're ready to start exploring! Continue to **01_data_discovery.ipynb**.\n",
|
|
576
|
+
"\n",
|
|
577
|
+
"**Using your own data?** Just set `DATA_PATH` to your file:\n",
|
|
578
|
+
"```python\n",
|
|
579
|
+
"DATA_PATH = \"/path/to/your/data.csv\"\n",
|
|
580
|
+
"```\n",
|
|
581
|
+
"\n",
|
|
582
|
+
"**Using sample datasets?** Choose one based on your learning goal (the lines below are alternatives; keep exactly one `DATA_PATH` assignment):\n",
|
|
583
|
+
"```python\n",
|
|
584
|
+
"# Entity-level (standard flow)\n",
|
|
585
|
+
"DATA_PATH = \"../tests/fixtures/customer_retention_retail.csv\"\n",
|
|
586
|
+
"DATA_PATH = \"../tests/fixtures/bank_customer_churn.csv\"\n",
|
|
587
|
+
"DATA_PATH = \"../tests/fixtures/netflix_customer_churn.csv\"\n",
|
|
588
|
+
"\n",
|
|
589
|
+
"# Event-level (time series flow)\n",
|
|
590
|
+
"DATA_PATH = \"../tests/fixtures/customer_transactions.csv\"\n",
|
|
591
|
+
"DATA_PATH = \"../tests/fixtures/customer_emails.csv\"\n",
|
|
592
|
+
"```"
|
|
593
|
+
]
|
|
594
|
+
},
|
|
595
|
+
{
|
|
596
|
+
"cell_type": "markdown",
|
|
597
|
+
"id": "ubychqgwi4f",
|
|
598
|
+
"metadata": {
|
|
599
|
+
"papermill": {
|
|
600
|
+
"duration": 0.00124,
|
|
601
|
+
"end_time": "2026-02-02T13:00:35.536689",
|
|
602
|
+
"exception": false,
|
|
603
|
+
"start_time": "2026-02-02T13:00:35.535449",
|
|
604
|
+
"status": "completed"
|
|
605
|
+
},
|
|
606
|
+
"tags": []
|
|
607
|
+
},
|
|
608
|
+
"source": [
|
|
609
|
+
"> **Save Reminder:** Save this notebook (Ctrl+S / Cmd+S) before running the next one.\n",
|
|
610
|
+
"> The next notebook will automatically export this notebook's HTML documentation from the saved file."
|
|
611
|
+
]
|
|
612
|
+
}
|
|
613
|
+
],
|
|
614
|
+
"metadata": {
|
|
615
|
+
"kernelspec": {
|
|
616
|
+
"display_name": "Python 3",
|
|
617
|
+
"language": "python",
|
|
618
|
+
"name": "python3"
|
|
619
|
+
},
|
|
620
|
+
"language_info": {
|
|
621
|
+
"codemirror_mode": {
|
|
622
|
+
"name": "ipython",
|
|
623
|
+
"version": 3
|
|
624
|
+
},
|
|
625
|
+
"file_extension": ".py",
|
|
626
|
+
"mimetype": "text/x-python",
|
|
627
|
+
"name": "python",
|
|
628
|
+
"nbconvert_exporter": "python",
|
|
629
|
+
"pygments_lexer": "ipython3",
|
|
630
|
+
"version": "3.12.4"
|
|
631
|
+
},
|
|
632
|
+
"papermill": {
|
|
633
|
+
"default_parameters": {},
|
|
634
|
+
"duration": 2.413615,
|
|
635
|
+
"end_time": "2026-02-02T13:00:35.753031",
|
|
636
|
+
"environment_variables": {},
|
|
637
|
+
"exception": null,
|
|
638
|
+
"input_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/00_start_here.ipynb",
|
|
639
|
+
"output_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/00_start_here.ipynb",
|
|
640
|
+
"parameters": {},
|
|
641
|
+
"start_time": "2026-02-02T13:00:33.339416",
|
|
642
|
+
"version": "2.6.0"
|
|
643
|
+
}
|
|
644
|
+
},
|
|
645
|
+
"nbformat": 4,
|
|
646
|
+
"nbformat_minor": 5
|
|
647
|
+
}
|