churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
customer_retention/stages/temporal/data_preparer.py
@@ -0,0 +1,178 @@
"""Unified data preparation for leakage-safe ML pipelines.

This module provides the main entry point for preparing raw data for
ML training with point-in-time correctness. It combines timestamp
management, snapshot creation, and validation into a single workflow.

Example:
    >>> from customer_retention.stages.temporal import (
    ...     ScenarioDetector, UnifiedDataPreparer
    ... )
    >>> from datetime import datetime
    >>>
    >>> # Detect scenario and get config
    >>> detector = ScenarioDetector()
    >>> scenario, config, _ = detector.detect(df, "churn")
    >>>
    >>> # Prepare data
    >>> preparer = UnifiedDataPreparer(output_path, config)
    >>> prepared_df = preparer.prepare_from_raw(df, "churn", "customer_id")
    >>>
    >>> # Create training snapshot
    >>> snapshot_df, meta = preparer.create_training_snapshot(
    ...     prepared_df,
    ...     cutoff_date=datetime(2024, 6, 1)
    ... )
"""

from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Optional

import pandas as pd

from .point_in_time_join import PointInTimeJoiner
from .snapshot_manager import SnapshotManager
from .timestamp_manager import TimestampConfig, TimestampManager


@dataclass
class PreparedData:
    """Container for prepared data with validation results.

    Attributes:
        unified_df: The prepared DataFrame with timestamps
        snapshot_metadata: Metadata about the training snapshot
        timestamp_strategy: Strategy used for timestamp handling
        validation_report: Report from temporal integrity validation
    """
    unified_df: pd.DataFrame
    snapshot_metadata: dict[str, Any]
    timestamp_strategy: str
    validation_report: dict[str, Any]


class UnifiedDataPreparer:
    """Unified entry point for preparing data with temporal correctness.

    The UnifiedDataPreparer combines timestamp management, data validation,
    and snapshot creation into a single workflow. It ensures all data
    passes through proper point-in-time handling before being used for
    training or inference.

    Example:
        >>> preparer = UnifiedDataPreparer(output_path, config)
        >>> df = preparer.prepare_from_raw(df, "churn", "customer_id")
        >>> snapshot_df, meta = preparer.create_training_snapshot(df, cutoff)
    """

    def __init__(self, output_path: Path, timestamp_config: TimestampConfig, storage=None):
        """Initialize the UnifiedDataPreparer.

        Args:
            output_path: Directory for output files (unified data, snapshots)
            timestamp_config: Configuration for timestamp handling
            storage: Optional DeltaStorage backend
        """
        self.output_path = Path(output_path)
        self.timestamp_manager = TimestampManager(timestamp_config)
        self.snapshot_manager = SnapshotManager(output_path, storage=storage)
        self.timestamp_config = timestamp_config
        self.pit_joiner = PointInTimeJoiner()
        self.storage = storage or _get_storage()

    def prepare_from_raw(
        self, df: pd.DataFrame, target_column: str, entity_column: str
    ) -> pd.DataFrame:
        df = self.timestamp_manager.ensure_timestamps(df)
        self.timestamp_manager.validate_point_in_time(df)

        df = df.rename(columns={target_column: "target", entity_column: "entity_id"})

        unified_dir = self.output_path / "unified" / "unified_dataset"
        unified_dir.parent.mkdir(parents=True, exist_ok=True)
        if self.storage and len(df) > 0:
            self.storage.write(df, str(unified_dir))
        else:
            parquet_path = self.output_path / "unified" / "unified_dataset.parquet"
            df.to_parquet(parquet_path, index=False)

        return df

    def create_training_snapshot(
        self, df: pd.DataFrame, cutoff_date: datetime, snapshot_name: str = "training",
        timestamp_series: Optional[pd.Series] = None,
    ) -> tuple[pd.DataFrame, dict[str, Any]]:
        metadata = self.snapshot_manager.create_snapshot(
            df=df, cutoff_date=cutoff_date, target_column="target",
            snapshot_name=snapshot_name, timestamp_series=timestamp_series,
        )
        snapshot_df, _ = self.snapshot_manager.load_snapshot(metadata.snapshot_id)
        return snapshot_df, self._metadata_to_dict(metadata)

    def load_for_eda(self, snapshot_id: str) -> pd.DataFrame:
        df, metadata = self.snapshot_manager.load_snapshot(snapshot_id)
        print(f"Loaded snapshot: {snapshot_id}")
        print(f" Rows: {metadata.row_count:,}")
        print(f" Cutoff: {metadata.cutoff_date}")
        print(f" Hash: {metadata.data_hash}")
        return df

    def load_for_inference(self, df: pd.DataFrame, as_of_date: Optional[datetime] = None) -> pd.DataFrame:
        as_of_date = as_of_date or datetime.now()
        df = self.timestamp_manager.ensure_timestamps(df)
        df = df[df["feature_timestamp"] <= as_of_date].copy()
        df["label_available_flag"] = False
        df["label_timestamp"] = as_of_date
        return df

    def prepare_with_validation(
        self, df: pd.DataFrame, target_column: str, entity_column: str, cutoff_date: datetime
    ) -> PreparedData:
        unified_df = self.prepare_from_raw(df, target_column, entity_column)
        validation_report = self.pit_joiner.validate_temporal_integrity(unified_df)
        snapshot_df, snapshot_metadata = self.create_training_snapshot(unified_df, cutoff_date)

        return PreparedData(
            unified_df=snapshot_df,
            snapshot_metadata=snapshot_metadata,
            timestamp_strategy=self.timestamp_config.strategy.value,
            validation_report=validation_report,
        )

    def list_available_snapshots(self) -> list[str]:
        return self.snapshot_manager.list_snapshots()

    def get_snapshot_summary(self, snapshot_id: str) -> dict[str, Any]:
        _, metadata = self.snapshot_manager.load_snapshot(snapshot_id)
        return {
            "snapshot_id": metadata.snapshot_id,
            "version": metadata.version,
            "created_at": metadata.created_at.isoformat(),
            "cutoff_date": metadata.cutoff_date.isoformat(),
            "row_count": metadata.row_count,
            "feature_count": len(metadata.feature_columns),
            "data_hash": metadata.data_hash,
        }

    def _metadata_to_dict(self, metadata) -> dict[str, Any]:
        return {
            "snapshot_id": metadata.snapshot_id,
            "version": metadata.version,
            "created_at": metadata.created_at.isoformat(),
            "cutoff_date": metadata.cutoff_date.isoformat(),
            "row_count": metadata.row_count,
            "column_count": metadata.column_count,
            "data_hash": metadata.data_hash,
            "feature_columns": metadata.feature_columns,
            "target_column": metadata.target_column,
        }


def _get_storage():
    try:
        from customer_retention.integrations.adapters.factory import get_delta
        return get_delta(force_local=True)
    except ImportError:
        return None
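
The module docstring's workflow can be exercised end to end. The following is a minimal sketch, not shipped with the wheel: the toy DataFrame, its column names (customer_id, signup_date, churn), and the ./churn_output path are illustrative assumptions, and it presumes the discovery engine can infer a scenario from such a small frame.

# Sketch only: toy data and output path are invented for illustration.
from datetime import datetime
from pathlib import Path

import pandas as pd

from customer_retention.stages.temporal import ScenarioDetector, UnifiedDataPreparer

df = pd.DataFrame({
    "customer_id": [1, 2, 3],
    "signup_date": pd.to_datetime(["2024-01-05", "2024-02-10", "2024-03-20"]),
    "churn": [0, 1, 0],
})

# Detect the timestamp scenario, then prepare and snapshot the data.
scenario, config, _ = ScenarioDetector().detect(df, "churn")
preparer = UnifiedDataPreparer(Path("./churn_output"), config)
prepared = preparer.prepare_from_raw(df, "churn", "customer_id")
snapshot_df, meta = preparer.create_training_snapshot(
    prepared, cutoff_date=datetime(2024, 6, 1)
)
print(meta["snapshot_id"], meta["row_count"])
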

customer_retention/stages/temporal/point_in_time_join.py
@@ -0,0 +1,134 @@
"""Point-in-time correct joins for feature engineering.

This module provides utilities for joining feature tables while maintaining
temporal correctness. It ensures that features from the future are never
used to predict past events, preventing data leakage.

Key functions:
- join_features: Join feature tables with PIT correctness
- asof_join: Pandas merge_asof wrapper for temporal joins
- validate_no_future_data: Check for temporal violations
- validate_temporal_integrity: Comprehensive integrity check

Example:
    >>> from customer_retention.stages.temporal import PointInTimeJoiner
    >>> joiner = PointInTimeJoiner()
    >>> merged = joiner.join_features(
    ...     base_df=customers,
    ...     feature_df=transactions_agg,
    ...     entity_key="customer_id"
    ... )
"""

from typing import Any

import pandas as pd


class PointInTimeJoiner:
    """Utility class for point-in-time correct feature joins.

    The PointInTimeJoiner ensures that when joining feature tables,
    only features that were available at the time of the base record
    are included. This prevents temporal leakage.

    Example:
        >>> joiner = PointInTimeJoiner()
        >>> # Only features from before base_df's feature_timestamp are included
        >>> merged = joiner.join_features(base_df, feature_df, "customer_id")
    """

    @staticmethod
    def join_features(
        base_df: pd.DataFrame, feature_df: pd.DataFrame, entity_key: str,
        base_timestamp_col: str = "feature_timestamp", feature_timestamp_col: str = "feature_timestamp"
    ) -> pd.DataFrame:
        if base_timestamp_col not in base_df.columns:
            raise ValueError(f"Base df missing timestamp column: {base_timestamp_col}")
        if feature_timestamp_col not in feature_df.columns:
            raise ValueError(f"Feature df missing timestamp column: {feature_timestamp_col}")

        feature_df = feature_df.rename(columns={feature_timestamp_col: "_feature_ts"})
        merged = base_df.merge(feature_df, on=entity_key, how="left")
        valid_mask = merged["_feature_ts"] <= merged[base_timestamp_col]

        merged = (
            merged[valid_mask]
            .sort_values([entity_key, "_feature_ts"])
            .groupby(entity_key)
            .last()
            .reset_index()
            .drop(columns=["_feature_ts"])
        )
        return merged

    @staticmethod
    def validate_no_future_data(
        df: pd.DataFrame, reference_timestamp_col: str, check_columns: list[str]
    ) -> dict[str, Any]:
        issues: dict[str, Any] = {}
        for col in check_columns:
            if pd.api.types.is_datetime64_any_dtype(df[col]):
                future_rows = df[df[col] > df[reference_timestamp_col]]
                if len(future_rows) > 0:
                    issues[col] = {
                        "violation_count": len(future_rows),
                        "example_ids": future_rows.index[:5].tolist()
                    }
        return issues

    @staticmethod
    def asof_join(
        left_df: pd.DataFrame, right_df: pd.DataFrame, entity_key: str,
        left_time_col: str, right_time_col: str, direction: str = "backward"
    ) -> pd.DataFrame:
        left_sorted = left_df.sort_values(left_time_col).reset_index(drop=True)
        right_sorted = right_df.sort_values(right_time_col).reset_index(drop=True)

        return pd.merge_asof(
            left_sorted, right_sorted, left_on=left_time_col, right_on=right_time_col,
            by=entity_key, direction=direction
        )

    @staticmethod
    def create_training_labels(
        df: pd.DataFrame, label_column: str, entity_key: str = "entity_id"
    ) -> pd.DataFrame:
        if "label_available_flag" not in df.columns:
            raise ValueError("DataFrame must have label_available_flag column")

        training_df = df[df["label_available_flag"] == True].copy()
        if label_column not in training_df.columns:
            raise ValueError(f"Label column '{label_column}' not found")

        return training_df[[entity_key, "feature_timestamp", "label_timestamp", label_column]]

    @staticmethod
    def validate_temporal_integrity(df: pd.DataFrame) -> dict[str, Any]:
        report = {"valid": True, "issues": []}

        if "feature_timestamp" in df.columns and "label_timestamp" in df.columns:
            violations = df[df["feature_timestamp"] > df["label_timestamp"]]
            if len(violations) > 0:
                report["valid"] = False
                report["issues"].append({
                    "type": "feature_after_label",
                    "count": len(violations),
                    "message": f"{len(violations)} rows have feature_timestamp > label_timestamp"
                })

        datetime_cols = df.select_dtypes(include=["datetime64"]).columns
        for col in datetime_cols:
            if col in ["feature_timestamp", "label_timestamp"]:
                continue
            if "feature_timestamp" in df.columns:
                future = df[df[col] > df["feature_timestamp"]]
                if len(future) > 0:
                    report["valid"] = False
                    report["issues"].append({
                        "type": "future_data",
                        "column": col,
                        "count": len(future),
                        "message": f"Column {col} has {len(future)} values after feature_timestamp"
                    })

        return report
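
Since join_features is pure pandas, its leakage guard is easy to see on a toy frame. A small sketch with invented data follows; note that the groupby(...).last() step collapses the output to one row per entity, keeping the most recent feature row that does not postdate the base timestamp.

# Sketch only: toy data invented for illustration.
import pandas as pd

from customer_retention.stages.temporal import PointInTimeJoiner

base = pd.DataFrame({
    "customer_id": [1, 2],
    "feature_timestamp": pd.to_datetime(["2024-06-01", "2024-03-01"]),
})
feats = pd.DataFrame({
    "customer_id": [1, 1, 2],
    "feature_timestamp": pd.to_datetime(["2024-05-20", "2024-07-01", "2024-02-15"]),
    "txn_count_30d": [9, 12, 4],
})

merged = PointInTimeJoiner.join_features(base, feats, "customer_id")
# Customer 1 gets txn_count_30d=9: the 2024-07-01 feature row lies after
# the base timestamp and is filtered out. Customer 2 keeps the 2024-02-15
# row (txn_count_30d=4).
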

customer_retention/stages/temporal/point_in_time_registry.py
@@ -0,0 +1,148 @@
import json
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Optional


@dataclass
class DatasetSnapshot:
    dataset_name: str
    snapshot_id: str
    cutoff_date: datetime
    source_path: str
    row_count: int
    created_at: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> dict:
        return {
            "dataset_name": self.dataset_name,
            "snapshot_id": self.snapshot_id,
            "cutoff_date": self.cutoff_date.isoformat(),
            "source_path": self.source_path,
            "row_count": self.row_count,
            "created_at": self.created_at.isoformat(),
        }

    @classmethod
    def from_dict(cls, data: dict) -> "DatasetSnapshot":
        return cls(
            dataset_name=data["dataset_name"],
            snapshot_id=data["snapshot_id"],
            cutoff_date=datetime.fromisoformat(data["cutoff_date"]),
            source_path=data["source_path"],
            row_count=data["row_count"],
            created_at=datetime.fromisoformat(data["created_at"]),
        )


@dataclass
class ConsistencyReport:
    is_consistent: bool
    reference_cutoff: Optional[datetime]
    inconsistent_datasets: list[str]
    message: str


class PointInTimeRegistry:
    REGISTRY_FILENAME = "point_in_time_registry.json"

    def __init__(self, output_dir: Path):
        self.output_dir = Path(output_dir)
        self.registry_path = self.output_dir / self.REGISTRY_FILENAME
        self.snapshots: dict[str, DatasetSnapshot] = {}
        self._load()

    def _load(self) -> None:
        if self.registry_path.exists():
            with open(self.registry_path) as f:
                data = json.load(f)
            self.snapshots = {
                name: DatasetSnapshot.from_dict(snap) for name, snap in data.get("snapshots", {}).items()
            }

    def _save(self) -> None:
        self.output_dir.mkdir(parents=True, exist_ok=True)
        data = {"snapshots": {name: snap.to_dict() for name, snap in self.snapshots.items()}}
        with open(self.registry_path, "w") as f:
            json.dump(data, f, indent=2)

    def get_reference_cutoff(self) -> Optional[datetime]:
        if not self.snapshots:
            return None
        return next(iter(self.snapshots.values())).cutoff_date

    def check_consistency(self) -> ConsistencyReport:
        if not self.snapshots:
            return ConsistencyReport(
                is_consistent=True, reference_cutoff=None, inconsistent_datasets=[], message="No datasets registered"
            )

        reference_cutoff = self.get_reference_cutoff()
        inconsistent = [
            name for name, snap in self.snapshots.items() if snap.cutoff_date.date() != reference_cutoff.date()
        ]

        if inconsistent:
            return ConsistencyReport(
                is_consistent=False,
                reference_cutoff=reference_cutoff,
                inconsistent_datasets=inconsistent,
                message=f"Inconsistent cutoff dates detected. Reference: {reference_cutoff.date()}. "
                f"Out of sync: {', '.join(inconsistent)}. Re-run exploration for these datasets.",
            )

        return ConsistencyReport(
            is_consistent=True,
            reference_cutoff=reference_cutoff,
            inconsistent_datasets=[],
            message=f"All {len(self.snapshots)} datasets use consistent cutoff: {reference_cutoff.date()}",
        )

    def validate_cutoff(self, proposed_cutoff: datetime) -> tuple[bool, str]:
        reference = self.get_reference_cutoff()
        if reference is None:
            return True, "First dataset - cutoff date will be set as reference"

        if proposed_cutoff.date() != reference.date():
            return False, (
                f"Cutoff date mismatch. Existing datasets use {reference.date()}. "
                f"Proposed: {proposed_cutoff.date()}. Change will require re-exploration of all datasets."
            )

        return True, f"Cutoff date matches reference: {reference.date()}"

    def register_snapshot(
        self, dataset_name: str, snapshot_id: str, cutoff_date: datetime, source_path: str, row_count: int
    ) -> DatasetSnapshot:
        snapshot = DatasetSnapshot(
            dataset_name=dataset_name,
            snapshot_id=snapshot_id,
            cutoff_date=cutoff_date,
            source_path=source_path,
            row_count=row_count,
        )
        self.snapshots[dataset_name] = snapshot
        self._save()
        return snapshot

    def get_snapshot(self, dataset_name: str) -> Optional[DatasetSnapshot]:
        return self.snapshots.get(dataset_name)

    def list_snapshots(self) -> list[DatasetSnapshot]:
        return list(self.snapshots.values())

    def get_out_of_sync_datasets(self, reference_cutoff: datetime) -> list[str]:
        return [name for name, snap in self.snapshots.items() if snap.cutoff_date.date() != reference_cutoff.date()]

    def clear_registry(self) -> None:
        self.snapshots = {}
        if self.registry_path.exists():
            self.registry_path.unlink()

    def update_cutoff_for_all(self, new_cutoff: datetime) -> list[str]:
        affected = list(self.snapshots.keys())
        for name in affected:
            self.snapshots[name].cutoff_date = new_cutoff
        self._save()
        return affected
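
The registry persists a single JSON file per output directory, so a short sketch can show the cutoff-consistency guard. Assumptions: the ./churn_output path, dataset names, and dates are invented, and the class is imported from its module path since this diff does not show which names the package __init__ re-exports.

# Sketch only: paths, names, and dates are illustrative.
from datetime import datetime
from pathlib import Path

from customer_retention.stages.temporal.point_in_time_registry import PointInTimeRegistry

registry = PointInTimeRegistry(Path("./churn_output"))
registry.register_snapshot(
    dataset_name="customers",
    snapshot_id="customers_20240601",
    cutoff_date=datetime(2024, 6, 1),
    source_path="data/customers.parquet",
    row_count=10_000,
)

# A second dataset proposing a different cutoff date is rejected with an
# explanation that the 2024-06-01 reference is already in force.
ok, msg = registry.validate_cutoff(datetime(2024, 7, 1))
print(ok, msg)
print(registry.check_consistency().message)
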

customer_retention/stages/temporal/scenario_detector.py
@@ -0,0 +1,163 @@
"""Automatic timestamp scenario detection for ML datasets.

This module provides high-level scenario detection that determines the
appropriate timestamp strategy for a given dataset. It wraps the
TimestampDiscoveryEngine and translates its results into actionable
configurations.

Scenarios:
- production: Dataset has explicit feature and label timestamps
- production_derived: Timestamps exist but need derivation
- partial: Only feature timestamp found, label derived from window
- derived: Timestamps can be computed from other columns (e.g., tenure)
- synthetic: No temporal information, must use synthetic timestamps

Example:
    >>> from customer_retention.stages.temporal import ScenarioDetector
    >>> detector = ScenarioDetector()
    >>> scenario, config, discovery = detector.detect(df, "churn")
    >>> print(f"Scenario: {scenario}")  # e.g., "production"
    >>> print(f"Strategy: {config.strategy.value}")  # e.g., "production"
"""

from datetime import datetime
from typing import Optional

import pandas as pd

from .timestamp_discovery import TimestampDiscoveryEngine, TimestampDiscoveryResult
from .timestamp_manager import TimestampConfig, TimestampStrategy


class ScenarioDetector:
    """Detects the timestamp scenario for a dataset and provides configuration.

    The ScenarioDetector analyzes a dataset to determine which timestamp
    handling strategy is appropriate, returning both a human-readable scenario
    name and a TimestampConfig ready for use with TimestampManager.

    Example:
        >>> detector = ScenarioDetector()
        >>> scenario, config, result = detector.detect(df, "churn")
        >>> # Use config with TimestampManager
        >>> from customer_retention.stages.temporal import TimestampManager
        >>> manager = TimestampManager(config)
        >>> df_with_timestamps = manager.ensure_timestamps(df)
    """

    def __init__(self, reference_date: Optional[datetime] = None, label_window_days: int = 180):
        self.label_window_days = label_window_days
        self.discovery_engine = TimestampDiscoveryEngine(reference_date, label_window_days)

    def detect(
        self, df: pd.DataFrame, target_column: str
    ) -> tuple[str, TimestampConfig, TimestampDiscoveryResult]:
        discovery_result = self.discovery_engine.discover(df, target_column)

        has_explicit_feature = discovery_result.feature_timestamp and not discovery_result.feature_timestamp.is_derived
        has_explicit_label = discovery_result.label_timestamp and not discovery_result.label_timestamp.is_derived
        label_derived_from_feature = (
            discovery_result.label_timestamp and
            discovery_result.label_timestamp.is_derived and
            discovery_result.feature_timestamp and
            discovery_result.feature_timestamp.column_name in discovery_result.label_timestamp.source_columns
        )

        if has_explicit_feature and has_explicit_label:
            return self._configure_production_scenario(discovery_result)
        elif has_explicit_feature and label_derived_from_feature:
            return self._configure_partial_scenario(discovery_result)
        elif discovery_result.feature_timestamp and discovery_result.label_timestamp:
            return self._configure_production_scenario(discovery_result)
        elif discovery_result.feature_timestamp:
            return self._configure_partial_scenario(discovery_result)
        elif discovery_result.derivable_options:
            return self._configure_derivable_scenario(discovery_result)
        return self._configure_synthetic_scenario(discovery_result)

    def _configure_production_scenario(
        self, result: TimestampDiscoveryResult
    ) -> tuple[str, TimestampConfig, TimestampDiscoveryResult]:
        feature_col = result.feature_timestamp.column_name if result.feature_timestamp else None
        label_col = result.label_timestamp.column_name if result.label_timestamp else None

        derivation_config = {}
        if result.feature_timestamp and result.feature_timestamp.is_derived:
            derivation_config["feature_derivation"] = {
                "formula": result.feature_timestamp.derivation_formula,
                "sources": result.feature_timestamp.source_columns,
            }
        if result.label_timestamp and result.label_timestamp.is_derived:
            derivation_config["label_derivation"] = {
                "formula": result.label_timestamp.derivation_formula,
                "sources": result.label_timestamp.source_columns,
            }

        config = TimestampConfig(
            strategy=TimestampStrategy.PRODUCTION,
            feature_timestamp_column=feature_col if not (result.feature_timestamp and result.feature_timestamp.is_derived) else None,
            label_timestamp_column=label_col if not (result.label_timestamp and result.label_timestamp.is_derived) else None,
            observation_window_days=self.label_window_days,
            derivation_config=derivation_config if derivation_config else None,
        )

        scenario = "production" if not derivation_config else "production_derived"
        return (scenario, config, result)

    def _configure_partial_scenario(
        self, result: TimestampDiscoveryResult
    ) -> tuple[str, TimestampConfig, TimestampDiscoveryResult]:
        config = TimestampConfig(
            strategy=TimestampStrategy.PRODUCTION,
            feature_timestamp_column=result.feature_timestamp.column_name if result.feature_timestamp else None,
            label_timestamp_column=None,
            observation_window_days=self.label_window_days,
            derive_label_from_feature=True,
        )
        return ("partial", config, result)

    def _configure_derivable_scenario(
        self, result: TimestampDiscoveryResult
    ) -> tuple[str, TimestampConfig, TimestampDiscoveryResult]:
        best_derivable = max(result.derivable_options, key=lambda c: c.confidence)

        config = TimestampConfig(
            strategy=TimestampStrategy.DERIVED,
            derivation_config={
                "feature_derivation": {
                    "formula": best_derivable.derivation_formula,
                    "sources": best_derivable.source_columns,
                }
            },
            observation_window_days=self.label_window_days,
        )
        return ("derived", config, result)

    def _configure_synthetic_scenario(
        self, result: TimestampDiscoveryResult
    ) -> tuple[str, TimestampConfig, TimestampDiscoveryResult]:
        config = TimestampConfig(
            strategy=TimestampStrategy.SYNTHETIC_INDEX,
            observation_window_days=self.label_window_days,
            synthetic_base_date="2024-01-01",
        )
        return ("synthetic", config, result)

    def get_scenario_summary(self, scenario: str, config: TimestampConfig, result: TimestampDiscoveryResult) -> dict:
        return {
            "scenario": scenario,
            "strategy": config.strategy.value,
            "feature_timestamp_column": config.feature_timestamp_column,
            "label_timestamp_column": config.label_timestamp_column,
            "observation_window_days": config.observation_window_days,
            "requires_derivation": config.derivation_config is not None,
            "requires_synthetic": result.requires_synthetic,
            "recommendation": result.recommendation,
            "datetime_columns_found": result.discovery_report.get("datetime_columns_found", 0),
            "derivable_timestamps_found": result.discovery_report.get("derivable_timestamps_found", 0),
        }


def auto_detect_and_configure(df: pd.DataFrame, target_column: str) -> tuple[str, TimestampConfig]:
    detector = ScenarioDetector()
    scenario, config, _ = detector.detect(df, target_column)
    return scenario, config
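
A quick sketch of the convenience function, again with invented data. The outcome depends on what TimestampDiscoveryEngine (defined in timestamp_discovery.py, not shown in this section) reports: per the module docstring, a tenure-style column may yield the "derived" scenario, while a frame with no temporal signal at all should fall through to "synthetic" with the fixed 2024-01-01 base date.

# Sketch only: toy frame; the detected scenario depends on the discovery engine.
import pandas as pd

from customer_retention.stages.temporal.scenario_detector import auto_detect_and_configure

df = pd.DataFrame({
    "customer_id": [1, 2, 3],
    "tenure_months": [12, 3, 27],
    "churn": [0, 1, 0],
})

scenario, config = auto_detect_and_configure(df, "churn")
print(scenario)                        # e.g. "derived" or "synthetic"
print(config.strategy.value)
print(config.observation_window_days)  # 180 by default
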