churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,166 @@
1
+ """Temporal framework for leakage-safe ML pipelines.
2
+
3
+ This module provides infrastructure for preventing data leakage in ML training
4
+ by enforcing point-in-time (PIT) correctness throughout the data preparation
5
+ and training pipeline.
6
+
7
+ Core Components:
8
+ - TimestampManager: Ensures proper timestamp columns exist
9
+ - TimestampDiscoveryEngine: Auto-detects timestamps in datasets
10
+ - ScenarioDetector: Determines appropriate timestamp strategy
11
+ - UnifiedDataPreparer: Single entry point for data preparation
12
+ - SnapshotManager: Versioned training snapshots with integrity hashing
13
+ - DataAccessGuard: Context-based data access control
14
+
15
+ Quick Start:
16
+ >>> from customer_retention.stages.temporal import (
17
+ ... ScenarioDetector, UnifiedDataPreparer
18
+ ... )
19
+ >>> from datetime import datetime
20
+ >>>
21
+ >>> # Detect scenario and prepare data
22
+ >>> detector = ScenarioDetector()
23
+ >>> scenario, config, _ = detector.detect(df, target_column="churn")
24
+ >>>
25
+ >>> preparer = UnifiedDataPreparer(output_path="./output", timestamp_config=config)
26
+ >>> prepared_df = preparer.prepare_from_raw(df, "churn", "customer_id")
27
+ >>>
28
+ >>> # Create versioned training snapshot
29
+ >>> snapshot_df, meta = preparer.create_training_snapshot(
30
+ ... prepared_df,
31
+ ... cutoff_date=datetime(2024, 6, 1)
32
+ ... )
33
+ >>> print(f"Snapshot: {meta['snapshot_id']}, hash: {meta['data_hash']}")
34
+
35
+ Timestamp Scenarios:
36
+ - production: Dataset has explicit feature and label timestamps
37
+ - partial: Only feature timestamp found, label derived from window
38
+ - derived: Timestamps can be computed from other columns
39
+ - synthetic: No temporal information, must use synthetic timestamps
40
+ """
41
+
42
+ # Import canonical temporal metadata columns from central location
43
+ from customer_retention.core.utils.leakage import TEMPORAL_METADATA_COLUMNS
44
+
45
+ from .access_guard import AccessContext, DataAccessGuard
46
+ from .cutoff_analyzer import CutoffAnalysis, CutoffAnalyzer, SplitResult
47
+ from .data_preparer import PreparedData, UnifiedDataPreparer
48
+ from .point_in_time_join import PointInTimeJoiner
49
+ from .point_in_time_registry import ConsistencyReport, DatasetSnapshot, PointInTimeRegistry
50
+ from .scenario_detector import ScenarioDetector
51
+ from .snapshot_manager import SnapshotManager, SnapshotMetadata
52
+ from .synthetic_coordinator import SyntheticCoordinationParams, SyntheticTimestampCoordinator
53
+ from .timestamp_discovery import (
54
+ DatetimeOrderAnalyzer,
55
+ TimestampCandidate,
56
+ TimestampDiscoveryEngine,
57
+ TimestampDiscoveryResult,
58
+ TimestampRole,
59
+ )
60
+ from .timestamp_manager import TimestampConfig, TimestampManager, TimestampStrategy
61
+
62
+ # Backwards compatible alias - prefer TEMPORAL_METADATA_COLUMNS
63
+ TEMPORAL_METADATA_COLS = TEMPORAL_METADATA_COLUMNS
64
+
65
+
66
+ def _restore_snapshot_columns(df, findings):
67
+ """Reverse the entity_id/target renames applied by UnifiedDataPreparer."""
68
+ renames = {}
69
+ ts_meta = getattr(findings, "time_series_metadata", None)
70
+ entity_col = ts_meta.entity_column if ts_meta else None
71
+ target_col = getattr(findings, "target_column", None)
72
+
73
+ if entity_col and "entity_id" in df.columns and entity_col not in df.columns:
74
+ renames["entity_id"] = entity_col
75
+ if target_col and "target" in df.columns and target_col not in df.columns:
76
+ renames["target"] = target_col
77
+
78
+ return df.rename(columns=renames) if renames else df
79
+
80
+
81
def load_data_with_snapshot_preference(findings, output_dir: str = "../explorations"):
    """Load data preferring training snapshots over raw source files.

    Implements the recommended loading pattern for exploration notebooks:

    1. If ``findings.snapshot_path`` points at an existing parquet file,
       load it directly.
    2. Otherwise, if ``<output_dir>/snapshots`` exists and contains
       snapshots, load the latest one via :class:`SnapshotManager`.
    3. Otherwise fall back to ``findings.source_path`` (CSV or parquet,
       chosen by file extension).

    Snapshot loads are passed through ``_restore_snapshot_columns`` so the
    generic ``entity_id``/``target`` names are mapped back to the original
    column names recorded in *findings*.

    Parameters
    ----------
    findings : ExplorationFindings
        The findings object loaded from a previous exploration.
    output_dir : str
        Directory containing explorations and snapshots.

    Returns
    -------
    tuple[pd.DataFrame, str]
        The loaded DataFrame and the provenance of the data: ``"snapshot"``
        (explicit snapshot path), ``"snapshot:<snapshot_id>"`` (latest
        managed snapshot), or ``"source"`` (raw source file).

    Example
    -------
    >>> from customer_retention.stages.temporal import load_data_with_snapshot_preference
    >>> findings = ExplorationFindings.load(FINDINGS_PATH)
    >>> df, source = load_data_with_snapshot_preference(findings)
    >>> print(f"Loaded from: {source}")
    """
    from pathlib import Path

    import pandas as pd

    # 1. Explicit snapshot recorded on the findings object.
    snapshot_path = getattr(findings, 'snapshot_path', None)
    if snapshot_path and Path(snapshot_path).exists():
        df = pd.read_parquet(snapshot_path)
        return _restore_snapshot_columns(df, findings), "snapshot"

    # 2. Latest managed snapshot under <output_dir>/snapshots.
    output_path = Path(output_dir) / "snapshots"
    if output_path.exists():
        snapshot_manager = SnapshotManager(Path(output_dir))
        if snapshot_manager.list_snapshots():
            latest = snapshot_manager.get_latest_snapshot()
            if latest:
                df, _ = snapshot_manager.load_snapshot(latest)
                return _restore_snapshot_columns(df, findings), f"snapshot:{latest}"

    # 3. Fall back to the raw source file. Case-insensitive suffix check so
    #    e.g. "data.CSV" is read as CSV rather than failing in read_parquet.
    source_path = findings.source_path
    if Path(source_path).suffix.lower() == '.csv':
        df = pd.read_csv(source_path)
    else:
        df = pd.read_parquet(source_path)

    return df, "source"
137
+
138
+
139
# Public API of the temporal package (order preserved from the original).
__all__ = [
    # Timestamp discovery and management
    "DatetimeOrderAnalyzer",
    "TimestampStrategy",
    "TimestampConfig",
    "TimestampManager",
    "TimestampRole",
    "TimestampCandidate",
    "TimestampDiscoveryResult",
    "TimestampDiscoveryEngine",
    # Snapshots and point-in-time joins
    "SnapshotMetadata",
    "SnapshotManager",
    "PointInTimeJoiner",
    # Unified data preparation
    "PreparedData",
    "UnifiedDataPreparer",
    # Context-based access control
    "AccessContext",
    "DataAccessGuard",
    # Scenario detection and point-in-time registry
    "ScenarioDetector",
    "DatasetSnapshot",
    "ConsistencyReport",
    "PointInTimeRegistry",
    # Cutoff analysis
    "CutoffAnalysis",
    "CutoffAnalyzer",
    "SplitResult",
    # Synthetic timestamp coordination
    "SyntheticCoordinationParams",
    "SyntheticTimestampCoordinator",
    # Helpers and backwards-compatible aliases
    "load_data_with_snapshot_preference",
    "TEMPORAL_METADATA_COLS",
]
@@ -0,0 +1,180 @@
1
+ """Data access control based on execution context.
2
+
3
+ This module provides path-based access control to prevent accidental
4
+ data leakage by restricting which data paths are accessible in different
5
+ execution contexts (exploration, training, inference, etc.).
6
+
7
+ Key concepts:
8
+ - AccessContext: The current execution mode
9
+ - DataAccessGuard: Validates path access against context rules
10
+ - require_context: Decorator to enforce context requirements
11
+
12
+ Example:
13
+ >>> from customer_retention.stages.temporal import AccessContext, DataAccessGuard
14
+ >>> # Set context for the session
15
+ >>> with DataAccessGuard(AccessContext.TRAINING):
16
+ ... # Can access snapshots/ and gold/
17
+ ... df = pd.read_parquet("output/snapshots/training_v1.parquet")
18
+ ... # This would raise PermissionError:
19
+ ... # df = pd.read_parquet("output/raw/customers.csv")
20
+ """
21
+
22
+ import os
23
+ from enum import Enum
24
+ from pathlib import Path
25
+ from typing import Optional
26
+
27
+
28
+ class AccessContext(Enum):
29
+ """Execution context for data access control.
30
+
31
+ Attributes:
32
+ EXPLORATION: Interactive data exploration (can access snapshots)
33
+ TRAINING: Model training (can access snapshots and gold)
34
+ INFERENCE: Production inference (can access gold and feature_store)
35
+ BACKFILL: Historical data processing (can access raw through gold)
36
+ ADMIN: Administrative access (unrestricted)
37
+ """
38
+ EXPLORATION = "exploration"
39
+ TRAINING = "training"
40
+ INFERENCE = "inference"
41
+ BACKFILL = "backfill"
42
+ ADMIN = "admin"
43
+
44
+
45
+ class DataAccessGuard:
46
+ """Guards data access based on the current execution context.
47
+
48
+ The DataAccessGuard prevents accidental data leakage by restricting
49
+ which paths can be accessed based on the execution context. For example,
50
+ during training, raw data paths are blocked to ensure only properly
51
+ prepared snapshots are used.
52
+
53
+ Can be used as a context manager to temporarily set the access context:
54
+
55
+ >>> with DataAccessGuard(AccessContext.TRAINING):
56
+ ... # Only training-appropriate paths accessible here
57
+ ... pass
58
+
59
+ Or used directly for path validation:
60
+
61
+ >>> guard = DataAccessGuard(AccessContext.EXPLORATION)
62
+ >>> guard.validate_access("output/snapshots/v1.parquet") # OK
63
+ >>> guard.validate_access("output/raw/data.csv") # Raises PermissionError
64
+ """
65
+
66
+ ALLOWED_PATHS = {
67
+ AccessContext.EXPLORATION: ["snapshots/"],
68
+ AccessContext.TRAINING: ["snapshots/", "gold/"],
69
+ AccessContext.INFERENCE: ["gold/", "feature_store/"],
70
+ AccessContext.BACKFILL: ["raw/", "bronze/", "silver/", "gold/"],
71
+ AccessContext.ADMIN: ["*"],
72
+ }
73
+
74
+ BLOCKED_PATHS = {
75
+ AccessContext.EXPLORATION: ["raw/", "bronze/", "silver/"],
76
+ AccessContext.TRAINING: ["raw/", "bronze/"],
77
+ AccessContext.INFERENCE: ["snapshots/", "raw/", "bronze/", "silver/"],
78
+ AccessContext.BACKFILL: ["snapshots/"],
79
+ AccessContext.ADMIN: [],
80
+ }
81
+
82
+ def __init__(self, context: AccessContext):
83
+ self.context = context
84
+
85
+ def validate_access(self, path: str) -> bool:
86
+ path_str = str(path)
87
+ for blocked in self.BLOCKED_PATHS[self.context]:
88
+ if blocked in path_str:
89
+ raise PermissionError(
90
+ f"Access to '{path}' blocked in {self.context.value} context. "
91
+ f"Blocked patterns: {self.BLOCKED_PATHS[self.context]}"
92
+ )
93
+ return True
94
+
95
+ def is_allowed(self, path: str) -> bool:
96
+ if "*" in self.ALLOWED_PATHS[self.context]:
97
+ return True
98
+ path_str = str(path)
99
+ return any(allowed in path_str for allowed in self.ALLOWED_PATHS[self.context])
100
+
101
+ def guard_read(self, path: str) -> Path:
102
+ self.validate_access(path)
103
+ return Path(path)
104
+
105
+ @staticmethod
106
+ def set_context(context: AccessContext) -> None:
107
+ os.environ["DATA_ACCESS_CONTEXT"] = context.value
108
+
109
+ @staticmethod
110
+ def get_current_context() -> AccessContext:
111
+ ctx = os.environ.get("DATA_ACCESS_CONTEXT", "exploration")
112
+ return AccessContext(ctx)
113
+
114
+ @classmethod
115
+ def from_environment(cls) -> "DataAccessGuard":
116
+ return cls(cls.get_current_context())
117
+
118
+ def __enter__(self) -> "DataAccessGuard":
119
+ self._previous_context = os.environ.get("DATA_ACCESS_CONTEXT")
120
+ os.environ["DATA_ACCESS_CONTEXT"] = self.context.value
121
+ return self
122
+
123
+ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
124
+ if self._previous_context:
125
+ os.environ["DATA_ACCESS_CONTEXT"] = self._previous_context
126
+ elif "DATA_ACCESS_CONTEXT" in os.environ:
127
+ del os.environ["DATA_ACCESS_CONTEXT"]
128
+
129
+
130
def require_context(*allowed_contexts: AccessContext):
    """Decorator to enforce execution context requirements on functions.

    Use this decorator to restrict a function to specific execution contexts.
    If called from a disallowed context, raises PermissionError.

    Args:
        *allowed_contexts: One or more AccessContext values that are permitted

    Example:
        >>> @require_context(AccessContext.TRAINING, AccessContext.INFERENCE)
        ... def predict(features):
        ...     return model.predict(features)
        >>>
        >>> # Only works in TRAINING or INFERENCE context
        >>> DataAccessGuard.set_context(AccessContext.TRAINING)
        >>> predict(X)  # OK
        >>> DataAccessGuard.set_context(AccessContext.EXPLORATION)
        >>> predict(X)  # Raises PermissionError
    """
    from functools import wraps  # local import keeps the module surface unchanged

    def decorator(func):
        # Fix: preserve the wrapped function's __name__/__doc__/signature
        # metadata, which the bare wrapper previously discarded.
        @wraps(func)
        def wrapper(*args, **kwargs):
            # The context is re-read from the environment on every call, so
            # switching contexts between calls takes effect immediately.
            current = DataAccessGuard.get_current_context()
            if current not in allowed_contexts:
                raise PermissionError(
                    f"Function requires context {[c.value for c in allowed_contexts]}, "
                    f"but current context is {current.value}"
                )
            return func(*args, **kwargs)
        return wrapper
    return decorator
160
+ return decorator
161
+
162
+
163
def guarded_read(path: str, context: Optional[AccessContext] = None) -> Path:
    """Validate path access and return a Path object.

    Convenience wrapper around DataAccessGuard.guard_read: checks *path*
    against the access rules of *context*, or of the context declared in the
    environment when *context* is None.

    Args:
        path: Path to validate
        context: Optional context override (uses environment if None)

    Returns:
        Path object for the validated path

    Raises:
        PermissionError: If access is not allowed in the current context
    """
    if context is None:
        active_guard = DataAccessGuard.from_environment()
    else:
        active_guard = DataAccessGuard(context)
    return active_guard.guard_read(path)
@@ -0,0 +1,235 @@
1
+ import warnings
2
+ from dataclasses import dataclass, field
3
+ from datetime import datetime
4
+ from typing import Optional
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ from customer_retention.stages.temporal.timestamp_discovery import DatetimeOrderAnalyzer
10
+
11
+
12
@dataclass
class SplitResult:
    """Outcome of CutoffAnalysis.split_at_cutoff: the train/score/unresolvable
    partition of the source frame at a given cutoff date, plus row counts.

    Invariant (enforced by the splitter): train_count + score_count +
    unresolvable_count == original_count.
    """
    train_df: pd.DataFrame          # rows with resolved timestamp <= cutoff_date
    score_df: pd.DataFrame          # rows with resolved timestamp > cutoff_date
    unresolvable_df: pd.DataFrame   # rows whose timestamp is NaT / unresolvable
    cutoff_date: datetime           # cutoff actually used for the split
    timestamp_source: str           # name of the timestamp column/series used
    train_count: int                # len(train_df)
    score_count: int                # len(score_df)
    unresolvable_count: int         # len(unresolvable_df)
    original_count: int             # rows in the source frame before splitting
23
+
24
+
25
@dataclass
class CutoffAnalysis:
    """Time-binned distribution of a timestamp column.

    Produced by ``CutoffAnalyzer.analyze``.  Supports picking a train/score
    cutoff date (``suggest_cutoff``) and materialising the corresponding
    split of the source frame (``split_at_cutoff``).  Percentages are
    cumulative: ``train_percentages[i]`` is the share of timestamped rows
    falling at or before bin ``i``.
    """

    timestamp_column: str                  # column/series the analysis is based on
    total_rows: int                        # rows with a resolvable timestamp
    bins: list[datetime]                   # left edge of each time bin, ascending
    bin_counts: list[int]                  # rows per bin, aligned with `bins`
    train_percentages: list[float]         # cumulative % of rows up to each bin
    score_percentages: list[float]         # element-wise complement (100 - train)
    date_range: tuple[datetime, datetime]  # (min, max) resolved timestamp
    source_rows: int = 0                   # rows in the source frame, incl. unresolvable
    covered_rows: int = 0                  # rows with a non-null resolved timestamp
    # Full-length timestamp series aligned to the source frame (NaT where
    # unresolvable); required by split_at_cutoff().  Hidden from repr.
    resolved_timestamp_series: Optional[pd.Series] = field(default=None, repr=False)
    # Reference to the analyzed frame; required by split_at_cutoff().
    _source_df: Optional[pd.DataFrame] = field(default=None, repr=False)

    @property
    def coverage_ratio(self) -> float:
        """Fraction of source rows whose timestamp could be resolved."""
        return self.covered_rows / self.source_rows if self.source_rows > 0 else 0.0

    def suggest_cutoff(self, train_ratio: float = 0.9) -> datetime:
        """Return the first bin edge whose cumulative share reaches *train_ratio*.

        Falls back to the last bin edge when no bin reaches the target.

        Raises:
            IndexError: if the analysis contains no bins.
        """
        target_pct = train_ratio * 100
        for i, train_pct in enumerate(self.train_percentages):
            if train_pct >= target_pct:
                return self.bins[i]
        return self.bins[-1]

    def get_train_percentage(self, cutoff_date: datetime) -> float:
        """Approximate the train share (in percent) at *cutoff_date*.

        Returns the cumulative percentage of the bin just before the first
        bin edge at or past the cutoff (hence the ``i - 1``), or the final
        percentage when the cutoff is past every bin.
        """
        for i, bin_date in enumerate(self.bins):
            if bin_date >= cutoff_date:
                return self.train_percentages[max(0, i - 1)]
        return self.train_percentages[-1]

    def get_split_at_date(self, cutoff_date: datetime) -> dict:
        """Estimate train/score row counts and percentages at *cutoff_date*."""
        train_pct = self.get_train_percentage(cutoff_date)
        train_count = int(self.total_rows * train_pct / 100)
        return {
            "train_count": train_count,
            "score_count": self.total_rows - train_count,
            "train_pct": train_pct,
            "score_pct": 100 - train_pct,
        }

    def split_at_cutoff(self, cutoff_date: Optional[datetime] = None) -> "SplitResult":
        """Partition the source frame into train/score/unresolvable at a cutoff.

        Args:
            cutoff_date: Cutoff to split on; defaults to ``suggest_cutoff()``.

        Returns:
            SplitResult with train (<= cutoff), score (> cutoff) and
            unresolvable (NaT timestamp) frames plus their row counts.

        Raises:
            ValueError: if analyze() did not capture the timestamp series or
                the source frame reference.
            AssertionError: if the three partitions do not cover every row.
        """
        if self.resolved_timestamp_series is None:
            raise ValueError(
                "No resolved timestamp series available. "
                "Re-run analyze() to populate resolved_timestamp_series."
            )
        if self._source_df is None:
            raise ValueError(
                "No source DataFrame available. "
                "Re-run analyze() to populate the source reference."
            )

        cutoff = cutoff_date or self.suggest_cutoff()
        ts = self.resolved_timestamp_series
        df = self._source_df

        not_null_mask = ts.notna()
        train_mask = not_null_mask & (ts <= cutoff)
        score_mask = not_null_mask & (ts > cutoff)
        unresolvable_mask = ~not_null_mask

        train_df = df.loc[train_mask]
        score_df = df.loc[score_mask]
        unresolvable_df = df.loc[unresolvable_mask]

        # Conservation check.  Fix: raised explicitly rather than via a bare
        # `assert`, which would be silently stripped under `python -O`.
        if len(train_df) + len(score_df) + len(unresolvable_df) != len(df):
            raise AssertionError(
                f"Data loss detected: train({len(train_df)}) + score({len(score_df)}) + "
                f"unresolvable({len(unresolvable_df)}) != original({len(df)})"
            )

        return SplitResult(
            train_df=train_df,
            score_df=score_df,
            unresolvable_df=unresolvable_df,
            cutoff_date=cutoff,
            timestamp_source=self.timestamp_column,
            train_count=len(train_df),
            score_count=len(score_df),
            unresolvable_count=len(unresolvable_df),
            original_count=len(df),
        )

    def to_dataframe(self) -> pd.DataFrame:
        """Tabulate bins: per-bin count, cumulative count, train/score %."""
        cumulative = np.cumsum(self.bin_counts)
        return pd.DataFrame({
            "date": self.bins,
            "bin_count": self.bin_counts,
            "cumulative_count": cumulative,
            "train_pct": self.train_percentages,
            "score_pct": self.score_percentages,
        })

    def get_percentage_milestones(self, step: int = 5) -> list[dict]:
        """Return the first bin date at which each *step*% train share is reached."""
        milestones = []
        target_pcts = list(range(step, 100, step))
        for target in target_pcts:
            for i, train_pct in enumerate(self.train_percentages):
                if train_pct >= target:
                    milestones.append({
                        "date": self.bins[i],
                        "train_pct": round(train_pct, 1),
                        "score_pct": round(100 - train_pct, 1),
                    })
                    break
        return milestones
131
+
132
+
133
class CutoffAnalyzer:
    """Bins a DataFrame's timestamps over time and builds a CutoffAnalysis
    that supports choosing a train/score cutoff date."""

    # Checked in priority order when auto-detecting the timestamp column.
    TIMESTAMP_PATTERNS = ["feature_timestamp", "label_timestamp", "timestamp", "date", "datetime"]

    def __init__(self):
        self._datetime_analyzer = DatetimeOrderAnalyzer()

    def analyze(
        self,
        df: pd.DataFrame,
        timestamp_column: Optional[str] = None,
        n_bins: int = 20,
        timestamp_series: Optional[pd.Series] = None,
    ) -> CutoffAnalysis:
        """Resolve the timestamp source, bin it, and return a CutoffAnalysis.

        Warns when fewer than half of the rows carry a usable timestamp.
        """
        row_total = len(df)
        resolved_name, resolved_full = self._resolve_timestamp_series_full(
            df, timestamp_column, timestamp_series
        )
        usable = resolved_full.dropna()

        if len(usable) == 0:
            return self._empty_analysis(
                resolved_name, source_rows=row_total, df=df, full_series=resolved_full
            )

        usable_count = len(usable)
        ratio = usable_count / row_total if row_total > 0 else 0.0
        if ratio < 0.5:
            warnings.warn(
                f"Low timestamp coverage: {usable_count}/{row_total} rows "
                f"({ratio:.1%}). Results may not represent the full dataset.",
                stacklevel=2,
            )

        edges, per_bin = self._compute_bins(usable, n_bins)
        cumulative_pcts, remaining_pcts = self._compute_percentages(per_bin)

        return CutoffAnalysis(
            timestamp_column=resolved_name,
            total_rows=usable_count,
            bins=edges,
            bin_counts=per_bin,
            train_percentages=cumulative_pcts,
            score_percentages=remaining_pcts,
            date_range=(usable.min(), usable.max()),
            source_rows=row_total,
            covered_rows=usable_count,
            resolved_timestamp_series=resolved_full,
            _source_df=df,
        )

    def _resolve_timestamp_series_full(
        self,
        df: pd.DataFrame,
        timestamp_column: Optional[str],
        timestamp_series: Optional[pd.Series],
    ) -> tuple[str, pd.Series]:
        """Pick the timestamp source (explicit series > named column >
        auto-detected column) and coerce it to datetime, keeping NaT rows."""
        if timestamp_series is not None:
            label = timestamp_series.name or "timestamp_series"
            return label, self._ensure_datetime_series_full(timestamp_series)
        column = timestamp_column if timestamp_column else self._detect_timestamp_column(df)
        return column, self._ensure_datetime_series_full(df[column])

    def _detect_timestamp_column(self, df: pd.DataFrame) -> str:
        """Find a datetime-like column by name pattern, else take the first one."""
        candidates = self._datetime_analyzer._get_datetime_columns(df)
        match = next(
            (
                col
                for pattern in self.TIMESTAMP_PATTERNS
                for col in candidates
                if pattern in col.lower()
            ),
            None,
        )
        if match is not None:
            return match
        if candidates:
            return candidates[0]
        raise ValueError("No timestamp column found")

    def _ensure_datetime_series_full(self, series: pd.Series) -> pd.Series:
        """Coerce to datetime64 (unparseable values -> NaT); pass through
        series that are already datetime-typed."""
        if pd.api.types.is_datetime64_any_dtype(series):
            return series
        return pd.to_datetime(series, format="mixed", errors="coerce")

    def _compute_bins(self, ts_series: pd.Series, n_bins: int) -> tuple[list[datetime], list[int]]:
        """Histogram the timestamps into equal-width bins; a single-valued
        series collapses to one bin holding everything."""
        if ts_series.nunique() == 1:
            return [ts_series.iloc[0].to_pydatetime()], [len(ts_series)]

        edges = pd.date_range(ts_series.min(), ts_series.max(), periods=n_bins + 1)
        hist, _ = np.histogram(ts_series, bins=edges)
        left_edges = [edge.to_pydatetime() for edge in edges[:-1]]
        return left_edges, hist.tolist()

    def _compute_percentages(self, counts: list[int]) -> tuple[list[float], list[float]]:
        """Cumulative train % per bin and its complement (score %)."""
        total = sum(counts)
        if total == 0:
            return [0.0] * len(counts), [100.0] * len(counts)

        train_pcts = (np.cumsum(counts) / total * 100).tolist()
        return train_pcts, [100 - p for p in train_pcts]

    def _empty_analysis(self, ts_col: str, source_rows: int = 0, df: Optional[pd.DataFrame] = None, full_series: Optional[pd.Series] = None) -> CutoffAnalysis:
        """Placeholder CutoffAnalysis for frames with no resolvable timestamps."""
        return CutoffAnalysis(
            timestamp_column=ts_col,
            total_rows=0,
            bins=[],
            bin_counts=[],
            train_percentages=[],
            score_percentages=[],
            date_range=(datetime.now(), datetime.now()),
            source_rows=source_rows,
            covered_rows=0,
            resolved_timestamp_series=full_series,
            _source_df=df,
        )