churnkit-0.75.0a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,178 @@
+ """Unified data preparation for leakage-safe ML pipelines.
+
+ This module provides the main entry point for preparing raw data for
+ ML training with point-in-time correctness. It combines timestamp
+ management, snapshot creation, and validation into a single workflow.
+
+ Example:
+     >>> from customer_retention.stages.temporal import (
+     ...     ScenarioDetector, UnifiedDataPreparer
+     ... )
+     >>> from datetime import datetime
+     >>>
+     >>> # Detect scenario and get config
+     >>> detector = ScenarioDetector()
+     >>> scenario, config, _ = detector.detect(df, "churn")
+     >>>
+     >>> # Prepare data
+     >>> preparer = UnifiedDataPreparer(output_path, config)
+     >>> prepared_df = preparer.prepare_from_raw(df, "churn", "customer_id")
+     >>>
+     >>> # Create training snapshot
+     >>> snapshot_df, meta = preparer.create_training_snapshot(
+     ...     prepared_df,
+     ...     cutoff_date=datetime(2024, 6, 1)
+     ... )
+ """
+
+ from dataclasses import dataclass
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Optional
+
+ import pandas as pd
+
+ from .point_in_time_join import PointInTimeJoiner
+ from .snapshot_manager import SnapshotManager
+ from .timestamp_manager import TimestampConfig, TimestampManager
+
+
+ @dataclass
+ class PreparedData:
+     """Container for prepared data with validation results.
+
+     Attributes:
+         unified_df: The prepared DataFrame with timestamps
+         snapshot_metadata: Metadata about the training snapshot
+         timestamp_strategy: Strategy used for timestamp handling
+         validation_report: Report from temporal integrity validation
+     """
+     unified_df: pd.DataFrame
+     snapshot_metadata: dict[str, Any]
+     timestamp_strategy: str
+     validation_report: dict[str, Any]
+
+
+ class UnifiedDataPreparer:
+     """Unified entry point for preparing data with temporal correctness.
+
+     The UnifiedDataPreparer combines timestamp management, data validation,
+     and snapshot creation into a single workflow. It ensures all data
+     passes through proper point-in-time handling before being used for
+     training or inference.
+
+     Example:
+         >>> preparer = UnifiedDataPreparer(output_path, config)
+         >>> df = preparer.prepare_from_raw(df, "churn", "customer_id")
+         >>> snapshot_df, meta = preparer.create_training_snapshot(df, cutoff)
+     """
+
+     def __init__(self, output_path: Path, timestamp_config: TimestampConfig, storage=None):
+         """Initialize the UnifiedDataPreparer.
+
+         Args:
+             output_path: Directory for output files (unified data, snapshots)
+             timestamp_config: Configuration for timestamp handling
+             storage: Optional DeltaStorage backend
+         """
+         self.output_path = Path(output_path)
+         self.timestamp_manager = TimestampManager(timestamp_config)
+         self.snapshot_manager = SnapshotManager(output_path, storage=storage)
+         self.timestamp_config = timestamp_config
+         self.pit_joiner = PointInTimeJoiner()
+         self.storage = storage or _get_storage()
+
+     def prepare_from_raw(
+         self, df: pd.DataFrame, target_column: str, entity_column: str
+     ) -> pd.DataFrame:
+         df = self.timestamp_manager.ensure_timestamps(df)
+         self.timestamp_manager.validate_point_in_time(df)
+
+         df = df.rename(columns={target_column: "target", entity_column: "entity_id"})
+
+         unified_dir = self.output_path / "unified" / "unified_dataset"
+         unified_dir.parent.mkdir(parents=True, exist_ok=True)
+         if self.storage and len(df) > 0:
+             self.storage.write(df, str(unified_dir))
+         else:
+             parquet_path = self.output_path / "unified" / "unified_dataset.parquet"
+             df.to_parquet(parquet_path, index=False)
+
+         return df
+
+     def create_training_snapshot(
+         self, df: pd.DataFrame, cutoff_date: datetime, snapshot_name: str = "training",
+         timestamp_series: Optional[pd.Series] = None,
+     ) -> tuple[pd.DataFrame, dict[str, Any]]:
+         metadata = self.snapshot_manager.create_snapshot(
+             df=df, cutoff_date=cutoff_date, target_column="target",
+             snapshot_name=snapshot_name, timestamp_series=timestamp_series,
+         )
+         snapshot_df, _ = self.snapshot_manager.load_snapshot(metadata.snapshot_id)
+         return snapshot_df, self._metadata_to_dict(metadata)
+
+     def load_for_eda(self, snapshot_id: str) -> pd.DataFrame:
+         df, metadata = self.snapshot_manager.load_snapshot(snapshot_id)
+         print(f"Loaded snapshot: {snapshot_id}")
+         print(f" Rows: {metadata.row_count:,}")
+         print(f" Cutoff: {metadata.cutoff_date}")
+         print(f" Hash: {metadata.data_hash}")
+         return df
+
+     def load_for_inference(self, df: pd.DataFrame, as_of_date: Optional[datetime] = None) -> pd.DataFrame:
+         as_of_date = as_of_date or datetime.now()
+         df = self.timestamp_manager.ensure_timestamps(df)
+         df = df[df["feature_timestamp"] <= as_of_date].copy()
+         df["label_available_flag"] = False
+         df["label_timestamp"] = as_of_date
+         return df
+
+     def prepare_with_validation(
+         self, df: pd.DataFrame, target_column: str, entity_column: str, cutoff_date: datetime
+     ) -> PreparedData:
+         unified_df = self.prepare_from_raw(df, target_column, entity_column)
+         validation_report = self.pit_joiner.validate_temporal_integrity(unified_df)
+         snapshot_df, snapshot_metadata = self.create_training_snapshot(unified_df, cutoff_date)
+
+         return PreparedData(
+             unified_df=snapshot_df,
+             snapshot_metadata=snapshot_metadata,
+             timestamp_strategy=self.timestamp_config.strategy.value,
+             validation_report=validation_report,
+         )
+
+     def list_available_snapshots(self) -> list[str]:
+         return self.snapshot_manager.list_snapshots()
+
+     def get_snapshot_summary(self, snapshot_id: str) -> dict[str, Any]:
+         _, metadata = self.snapshot_manager.load_snapshot(snapshot_id)
+         return {
+             "snapshot_id": metadata.snapshot_id,
+             "version": metadata.version,
+             "created_at": metadata.created_at.isoformat(),
+             "cutoff_date": metadata.cutoff_date.isoformat(),
+             "row_count": metadata.row_count,
+             "feature_count": len(metadata.feature_columns),
+             "data_hash": metadata.data_hash,
+         }
+
+     def _metadata_to_dict(self, metadata) -> dict[str, Any]:
+         return {
+             "snapshot_id": metadata.snapshot_id,
+             "version": metadata.version,
+             "created_at": metadata.created_at.isoformat(),
+             "cutoff_date": metadata.cutoff_date.isoformat(),
+             "row_count": metadata.row_count,
+             "column_count": metadata.column_count,
+             "data_hash": metadata.data_hash,
+             "feature_columns": metadata.feature_columns,
+             "target_column": metadata.target_column,
+         }
+
+
+ def _get_storage():
+     try:
+         from customer_retention.integrations.adapters.factory import get_delta
+         return get_delta(force_local=True)
+     except ImportError:
+         return None
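
For orientation, the hunk above lines up with customer_retention/stages/temporal/data_preparer.py (the +178 entry in the file list). Below is a minimal sketch of the end-to-end flow it enables; the sample DataFrame, column names, and the ./artifacts path are illustrative assumptions, and actually running it depends on modules not shown in this diff (TimestampManager, SnapshotManager, and the timestamp discovery engine).

    from datetime import datetime
    from pathlib import Path

    import pandas as pd

    from customer_retention.stages.temporal import ScenarioDetector, UnifiedDataPreparer

    # Illustrative raw data; real inputs come from the ingestion stage.
    raw_df = pd.DataFrame({
        "customer_id": [1, 2],
        "signup_date": pd.to_datetime(["2023-01-15", "2023-03-02"]),
        "churn": [0, 1],
    })

    # Detect the timestamp scenario and build a matching config.
    detector = ScenarioDetector()
    scenario, config, _ = detector.detect(raw_df, "churn")

    # Prepare, validate temporal integrity, and snapshot in one call.
    preparer = UnifiedDataPreparer(Path("./artifacts"), config)
    prepared = preparer.prepare_with_validation(
        raw_df,
        target_column="churn",
        entity_column="customer_id",
        cutoff_date=datetime(2024, 6, 1),
    )
    print(prepared.timestamp_strategy, prepared.validation_report["valid"])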
@@ -0,0 +1,134 @@
+ """Point-in-time correct joins for feature engineering.
+
+ This module provides utilities for joining feature tables while maintaining
+ temporal correctness. It ensures that features from the future are never
+ used to predict past events, preventing data leakage.
+
+ Key functions:
+     - join_features: Join feature tables with PIT correctness
+     - asof_join: Pandas merge_asof wrapper for temporal joins
+     - validate_no_future_data: Check for temporal violations
+     - validate_temporal_integrity: Comprehensive integrity check
+
+ Example:
+     >>> from customer_retention.stages.temporal import PointInTimeJoiner
+     >>> joiner = PointInTimeJoiner()
+     >>> merged = joiner.join_features(
+     ...     base_df=customers,
+     ...     feature_df=transactions_agg,
+     ...     entity_key="customer_id"
+     ... )
+ """
+
+ from typing import Any
+
+ import pandas as pd
+
+
+ class PointInTimeJoiner:
+     """Utility class for point-in-time correct feature joins.
+
+     The PointInTimeJoiner ensures that when joining feature tables,
+     only features that were available at the time of the base record
+     are included. This prevents temporal leakage.
+
+     Example:
+         >>> joiner = PointInTimeJoiner()
+         >>> # Only features from before base_df's feature_timestamp are included
+         >>> merged = joiner.join_features(base_df, feature_df, "customer_id")
+     """
+     @staticmethod
+     def join_features(
+         base_df: pd.DataFrame, feature_df: pd.DataFrame, entity_key: str,
+         base_timestamp_col: str = "feature_timestamp", feature_timestamp_col: str = "feature_timestamp"
+     ) -> pd.DataFrame:
+         if base_timestamp_col not in base_df.columns:
+             raise ValueError(f"Base df missing timestamp column: {base_timestamp_col}")
+         if feature_timestamp_col not in feature_df.columns:
+             raise ValueError(f"Feature df missing timestamp column: {feature_timestamp_col}")
+
+         feature_df = feature_df.rename(columns={feature_timestamp_col: "_feature_ts"})
+         merged = base_df.merge(feature_df, on=entity_key, how="left")
+         valid_mask = merged["_feature_ts"] <= merged[base_timestamp_col]
+
+         merged = (
+             merged[valid_mask]
+             .sort_values([entity_key, "_feature_ts"])
+             .groupby(entity_key)
+             .last()
+             .reset_index()
+             .drop(columns=["_feature_ts"])
+         )
+         return merged
+
+     @staticmethod
+     def validate_no_future_data(
+         df: pd.DataFrame, reference_timestamp_col: str, check_columns: list[str]
+     ) -> dict[str, Any]:
+         issues: dict[str, Any] = {}
+         for col in check_columns:
+             if pd.api.types.is_datetime64_any_dtype(df[col]):
+                 future_rows = df[df[col] > df[reference_timestamp_col]]
+                 if len(future_rows) > 0:
+                     issues[col] = {
+                         "violation_count": len(future_rows),
+                         "example_ids": future_rows.index[:5].tolist()
+                     }
+         return issues
+
+     @staticmethod
+     def asof_join(
+         left_df: pd.DataFrame, right_df: pd.DataFrame, entity_key: str,
+         left_time_col: str, right_time_col: str, direction: str = "backward"
+     ) -> pd.DataFrame:
+         left_sorted = left_df.sort_values(left_time_col).reset_index(drop=True)
+         right_sorted = right_df.sort_values(right_time_col).reset_index(drop=True)
+
+         return pd.merge_asof(
+             left_sorted, right_sorted, left_on=left_time_col, right_on=right_time_col,
+             by=entity_key, direction=direction
+         )
+
+     @staticmethod
+     def create_training_labels(
+         df: pd.DataFrame, label_column: str, entity_key: str = "entity_id"
+     ) -> pd.DataFrame:
+         if "label_available_flag" not in df.columns:
+             raise ValueError("DataFrame must have label_available_flag column")
+
+         training_df = df[df["label_available_flag"] == True].copy()
+         if label_column not in training_df.columns:
+             raise ValueError(f"Label column '{label_column}' not found")
+
+         return training_df[[entity_key, "feature_timestamp", "label_timestamp", label_column]]
+
+     @staticmethod
+     def validate_temporal_integrity(df: pd.DataFrame) -> dict[str, Any]:
+         report = {"valid": True, "issues": []}
+
+         if "feature_timestamp" in df.columns and "label_timestamp" in df.columns:
+             violations = df[df["feature_timestamp"] > df["label_timestamp"]]
+             if len(violations) > 0:
+                 report["valid"] = False
+                 report["issues"].append({
+                     "type": "feature_after_label",
+                     "count": len(violations),
+                     "message": f"{len(violations)} rows have feature_timestamp > label_timestamp"
+                 })
+
+         datetime_cols = df.select_dtypes(include=["datetime64"]).columns
+         for col in datetime_cols:
+             if col in ["feature_timestamp", "label_timestamp"]:
+                 continue
+             if "feature_timestamp" in df.columns:
+                 future = df[df[col] > df["feature_timestamp"]]
+                 if len(future) > 0:
+                     report["valid"] = False
+                     report["issues"].append({
+                         "type": "future_data",
+                         "column": col,
+                         "count": len(future),
+                         "message": f"Column {col} has {len(future)} values after feature_timestamp"
+                     })
+
+         return report
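
The join logic above (which lines up with point_in_time_join.py, the +134 entry in the file list) is self-contained enough to illustrate with toy data. A small sketch follows, assuming only pandas and the class as shown; the customer IDs, timestamps, and the txn_count_90d and signup_date columns are made up for illustration.

    import pandas as pd

    from customer_retention.stages.temporal import PointInTimeJoiner

    # One base row per customer, stamped with the time features must be known by.
    base_df = pd.DataFrame({
        "customer_id": [1, 2],
        "feature_timestamp": pd.to_datetime(["2024-06-01", "2024-06-01"]),
        "signup_date": pd.to_datetime(["2023-01-15", "2023-03-02"]),
    })
    # Aggregated features, each stamped with when it became available.
    feature_df = pd.DataFrame({
        "customer_id": [1, 1, 2],
        "feature_timestamp": pd.to_datetime(["2024-02-15", "2024-05-20", "2024-07-01"]),
        "txn_count_90d": [4, 9, 3],
    })

    joiner = PointInTimeJoiner()
    # Customer 1 keeps its latest pre-cutoff feature row (txn_count_90d == 9);
    # customer 2's only feature row is in the future, so that row is dropped.
    merged = joiner.join_features(base_df, feature_df, entity_key="customer_id")

    # No signup_date exceeds feature_timestamp here, so this returns {}.
    issues = joiner.validate_no_future_data(
        merged, reference_timestamp_col="feature_timestamp", check_columns=["signup_date"]
    )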
@@ -0,0 +1,148 @@
+ import json
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Optional
+
+
+ @dataclass
+ class DatasetSnapshot:
+     dataset_name: str
+     snapshot_id: str
+     cutoff_date: datetime
+     source_path: str
+     row_count: int
+     created_at: datetime = field(default_factory=datetime.now)
+
+     def to_dict(self) -> dict:
+         return {
+             "dataset_name": self.dataset_name,
+             "snapshot_id": self.snapshot_id,
+             "cutoff_date": self.cutoff_date.isoformat(),
+             "source_path": self.source_path,
+             "row_count": self.row_count,
+             "created_at": self.created_at.isoformat(),
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict) -> "DatasetSnapshot":
+         return cls(
+             dataset_name=data["dataset_name"],
+             snapshot_id=data["snapshot_id"],
+             cutoff_date=datetime.fromisoformat(data["cutoff_date"]),
+             source_path=data["source_path"],
+             row_count=data["row_count"],
+             created_at=datetime.fromisoformat(data["created_at"]),
+         )
+
+
+ @dataclass
+ class ConsistencyReport:
+     is_consistent: bool
+     reference_cutoff: Optional[datetime]
+     inconsistent_datasets: list[str]
+     message: str
+
+
+ class PointInTimeRegistry:
+     REGISTRY_FILENAME = "point_in_time_registry.json"
+
+     def __init__(self, output_dir: Path):
+         self.output_dir = Path(output_dir)
+         self.registry_path = self.output_dir / self.REGISTRY_FILENAME
+         self.snapshots: dict[str, DatasetSnapshot] = {}
+         self._load()
+
+     def _load(self) -> None:
+         if self.registry_path.exists():
+             with open(self.registry_path) as f:
+                 data = json.load(f)
+             self.snapshots = {
+                 name: DatasetSnapshot.from_dict(snap) for name, snap in data.get("snapshots", {}).items()
+             }
+
+     def _save(self) -> None:
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+         data = {"snapshots": {name: snap.to_dict() for name, snap in self.snapshots.items()}}
+         with open(self.registry_path, "w") as f:
+             json.dump(data, f, indent=2)
+
+     def get_reference_cutoff(self) -> Optional[datetime]:
+         if not self.snapshots:
+             return None
+         return next(iter(self.snapshots.values())).cutoff_date
+
+     def check_consistency(self) -> ConsistencyReport:
+         if not self.snapshots:
+             return ConsistencyReport(
+                 is_consistent=True, reference_cutoff=None, inconsistent_datasets=[], message="No datasets registered"
+             )
+
+         reference_cutoff = self.get_reference_cutoff()
+         inconsistent = [
+             name for name, snap in self.snapshots.items() if snap.cutoff_date.date() != reference_cutoff.date()
+         ]
+
+         if inconsistent:
+             return ConsistencyReport(
+                 is_consistent=False,
+                 reference_cutoff=reference_cutoff,
+                 inconsistent_datasets=inconsistent,
+                 message=f"Inconsistent cutoff dates detected. Reference: {reference_cutoff.date()}. "
+                 f"Out of sync: {', '.join(inconsistent)}. Re-run exploration for these datasets.",
+             )
+
+         return ConsistencyReport(
+             is_consistent=True,
+             reference_cutoff=reference_cutoff,
+             inconsistent_datasets=[],
+             message=f"All {len(self.snapshots)} datasets use consistent cutoff: {reference_cutoff.date()}",
+         )
+
+     def validate_cutoff(self, proposed_cutoff: datetime) -> tuple[bool, str]:
+         reference = self.get_reference_cutoff()
+         if reference is None:
+             return True, "First dataset - cutoff date will be set as reference"
+
+         if proposed_cutoff.date() != reference.date():
+             return False, (
+                 f"Cutoff date mismatch. Existing datasets use {reference.date()}. "
+                 f"Proposed: {proposed_cutoff.date()}. Change will require re-exploration of all datasets."
+             )
+
+         return True, f"Cutoff date matches reference: {reference.date()}"
+
+     def register_snapshot(
+         self, dataset_name: str, snapshot_id: str, cutoff_date: datetime, source_path: str, row_count: int
+     ) -> DatasetSnapshot:
+         snapshot = DatasetSnapshot(
+             dataset_name=dataset_name,
+             snapshot_id=snapshot_id,
+             cutoff_date=cutoff_date,
+             source_path=source_path,
+             row_count=row_count,
+         )
+         self.snapshots[dataset_name] = snapshot
+         self._save()
+         return snapshot
+
+     def get_snapshot(self, dataset_name: str) -> Optional[DatasetSnapshot]:
+         return self.snapshots.get(dataset_name)
+
+     def list_snapshots(self) -> list[DatasetSnapshot]:
+         return list(self.snapshots.values())
+
+     def get_out_of_sync_datasets(self, reference_cutoff: datetime) -> list[str]:
+         return [name for name, snap in self.snapshots.items() if snap.cutoff_date.date() != reference_cutoff.date()]
+
+     def clear_registry(self) -> None:
+         self.snapshots = {}
+         if self.registry_path.exists():
+             self.registry_path.unlink()
+
+     def update_cutoff_for_all(self, new_cutoff: datetime) -> list[str]:
+         affected = list(self.snapshots.keys())
+         for name in affected:
+             self.snapshots[name].cutoff_date = new_cutoff
+         self._save()
+         return affected
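
The registry hunk above (which matches point_in_time_registry.py, +148 in the file list) carries no module docstring, so here is a hedged usage sketch; the import path, the ./artifacts/registry directory, and the dataset values are assumptions for illustration only.

    from datetime import datetime
    from pathlib import Path

    # Assumed module path, based on the matching file list entry above.
    from customer_retention.stages.temporal.point_in_time_registry import PointInTimeRegistry

    registry = PointInTimeRegistry(Path("./artifacts/registry"))

    # Check the proposed cutoff against the reference before registering.
    ok, msg = registry.validate_cutoff(datetime(2024, 6, 1))
    if ok:
        registry.register_snapshot(
            dataset_name="customers",
            snapshot_id="customers_20240601",
            cutoff_date=datetime(2024, 6, 1),
            source_path="data/customers.csv",
            row_count=10_000,
        )

    # Later runs can verify that every dataset still shares one cutoff date.
    report = registry.check_consistency()
    print(report.is_consistent, report.message)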
@@ -0,0 +1,163 @@
+ """Automatic timestamp scenario detection for ML datasets.
+
+ This module provides high-level scenario detection that determines the
+ appropriate timestamp strategy for a given dataset. It wraps the
+ TimestampDiscoveryEngine and translates its results into actionable
+ configurations.
+
+ Scenarios:
+     - production: Dataset has explicit feature and label timestamps
+     - production_derived: Timestamps exist but need derivation
+     - partial: Only feature timestamp found, label derived from window
+     - derived: Timestamps can be computed from other columns (e.g., tenure)
+     - synthetic: No temporal information, must use synthetic timestamps
+
+ Example:
+     >>> from customer_retention.stages.temporal import ScenarioDetector
+     >>> detector = ScenarioDetector()
+     >>> scenario, config, discovery = detector.detect(df, "churn")
+     >>> print(f"Scenario: {scenario}")  # e.g., "production"
+     >>> print(f"Strategy: {config.strategy.value}")  # e.g., "production"
+ """
+
+ from datetime import datetime
+ from typing import Optional
+
+ import pandas as pd
+
+ from .timestamp_discovery import TimestampDiscoveryEngine, TimestampDiscoveryResult
+ from .timestamp_manager import TimestampConfig, TimestampStrategy
+
+
+ class ScenarioDetector:
+     """Detects the timestamp scenario for a dataset and provides configuration.
+
+     The ScenarioDetector analyzes a dataset to determine which timestamp
+     handling strategy is appropriate, returning both a human-readable scenario
+     name and a TimestampConfig ready for use with TimestampManager.
+
+     Example:
+         >>> detector = ScenarioDetector()
+         >>> scenario, config, result = detector.detect(df, "churn")
+         >>> # Use config with TimestampManager
+         >>> from customer_retention.stages.temporal import TimestampManager
+         >>> manager = TimestampManager(config)
+         >>> df_with_timestamps = manager.ensure_timestamps(df)
+     """
+     def __init__(self, reference_date: Optional[datetime] = None, label_window_days: int = 180):
+         self.label_window_days = label_window_days
+         self.discovery_engine = TimestampDiscoveryEngine(reference_date, label_window_days)
+
+     def detect(
+         self, df: pd.DataFrame, target_column: str
+     ) -> tuple[str, TimestampConfig, TimestampDiscoveryResult]:
+         discovery_result = self.discovery_engine.discover(df, target_column)
+
+         has_explicit_feature = discovery_result.feature_timestamp and not discovery_result.feature_timestamp.is_derived
+         has_explicit_label = discovery_result.label_timestamp and not discovery_result.label_timestamp.is_derived
+         label_derived_from_feature = (
+             discovery_result.label_timestamp and
+             discovery_result.label_timestamp.is_derived and
+             discovery_result.feature_timestamp and
+             discovery_result.feature_timestamp.column_name in discovery_result.label_timestamp.source_columns
+         )
+
+         if has_explicit_feature and has_explicit_label:
+             return self._configure_production_scenario(discovery_result)
+         elif has_explicit_feature and label_derived_from_feature:
+             return self._configure_partial_scenario(discovery_result)
+         elif discovery_result.feature_timestamp and discovery_result.label_timestamp:
+             return self._configure_production_scenario(discovery_result)
+         elif discovery_result.feature_timestamp:
+             return self._configure_partial_scenario(discovery_result)
+         elif discovery_result.derivable_options:
+             return self._configure_derivable_scenario(discovery_result)
+         return self._configure_synthetic_scenario(discovery_result)
+
+     def _configure_production_scenario(
+         self, result: TimestampDiscoveryResult
+     ) -> tuple[str, TimestampConfig, TimestampDiscoveryResult]:
+         feature_col = result.feature_timestamp.column_name if result.feature_timestamp else None
+         label_col = result.label_timestamp.column_name if result.label_timestamp else None
+
+         derivation_config = {}
+         if result.feature_timestamp and result.feature_timestamp.is_derived:
+             derivation_config["feature_derivation"] = {
+                 "formula": result.feature_timestamp.derivation_formula,
+                 "sources": result.feature_timestamp.source_columns,
+             }
+         if result.label_timestamp and result.label_timestamp.is_derived:
+             derivation_config["label_derivation"] = {
+                 "formula": result.label_timestamp.derivation_formula,
+                 "sources": result.label_timestamp.source_columns,
+             }
+
+         config = TimestampConfig(
+             strategy=TimestampStrategy.PRODUCTION,
+             feature_timestamp_column=feature_col if not (result.feature_timestamp and result.feature_timestamp.is_derived) else None,
+             label_timestamp_column=label_col if not (result.label_timestamp and result.label_timestamp.is_derived) else None,
+             observation_window_days=self.label_window_days,
+             derivation_config=derivation_config if derivation_config else None,
+         )
+
+         scenario = "production" if not derivation_config else "production_derived"
+         return (scenario, config, result)
+
+     def _configure_partial_scenario(
+         self, result: TimestampDiscoveryResult
+     ) -> tuple[str, TimestampConfig, TimestampDiscoveryResult]:
+         config = TimestampConfig(
+             strategy=TimestampStrategy.PRODUCTION,
+             feature_timestamp_column=result.feature_timestamp.column_name if result.feature_timestamp else None,
+             label_timestamp_column=None,
+             observation_window_days=self.label_window_days,
+             derive_label_from_feature=True,
+         )
+         return ("partial", config, result)
+
+     def _configure_derivable_scenario(
+         self, result: TimestampDiscoveryResult
+     ) -> tuple[str, TimestampConfig, TimestampDiscoveryResult]:
+         best_derivable = max(result.derivable_options, key=lambda c: c.confidence)
+
+         config = TimestampConfig(
+             strategy=TimestampStrategy.DERIVED,
+             derivation_config={
+                 "feature_derivation": {
+                     "formula": best_derivable.derivation_formula,
+                     "sources": best_derivable.source_columns,
+                 }
+             },
+             observation_window_days=self.label_window_days,
+         )
+         return ("derived", config, result)
+
+     def _configure_synthetic_scenario(
+         self, result: TimestampDiscoveryResult
+     ) -> tuple[str, TimestampConfig, TimestampDiscoveryResult]:
+         config = TimestampConfig(
+             strategy=TimestampStrategy.SYNTHETIC_INDEX,
+             observation_window_days=self.label_window_days,
+             synthetic_base_date="2024-01-01",
+         )
+         return ("synthetic", config, result)
+
+     def get_scenario_summary(self, scenario: str, config: TimestampConfig, result: TimestampDiscoveryResult) -> dict:
+         return {
+             "scenario": scenario,
+             "strategy": config.strategy.value,
+             "feature_timestamp_column": config.feature_timestamp_column,
+             "label_timestamp_column": config.label_timestamp_column,
+             "observation_window_days": config.observation_window_days,
+             "requires_derivation": config.derivation_config is not None,
+             "requires_synthetic": result.requires_synthetic,
+             "recommendation": result.recommendation,
+             "datetime_columns_found": result.discovery_report.get("datetime_columns_found", 0),
+             "derivable_timestamps_found": result.discovery_report.get("derivable_timestamps_found", 0),
+         }
+
+
+ def auto_detect_and_configure(df: pd.DataFrame, target_column: str) -> tuple[str, TimestampConfig]:
+     detector = ScenarioDetector()
+     scenario, config, _ = detector.detect(df, target_column)
+     return scenario, config
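
Finally, a short sketch of how the detector above (apparently scenario_detector.py, +163 in the file list) is driven. The toy frame and its last_activity_at column are invented, and the scenario actually returned depends on TimestampDiscoveryEngine, which this diff does not show.

    import pandas as pd

    from customer_retention.stages.temporal import ScenarioDetector

    # Invented frame with one explicit datetime column and a binary target.
    df = pd.DataFrame({
        "customer_id": [1, 2],
        "last_activity_at": pd.to_datetime(["2024-04-01", "2024-05-15"]),
        "churn": [0, 1],
    })

    detector = ScenarioDetector(label_window_days=90)
    scenario, config, result = detector.detect(df, target_column="churn")

    # Summarize what was detected and which strategy the config encodes.
    summary = detector.get_scenario_summary(scenario, config, result)
    print(summary["scenario"], summary["strategy"], summary["observation_window_days"])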