churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,259 @@
1
+ """Versioned training snapshot management with integrity verification.
2
+
3
+ This module provides infrastructure for creating, versioning, and verifying
4
+ training data snapshots. Each snapshot includes:
5
+
6
+ - Point-in-time filtered data (only label_available=True records)
7
+ - SHA256 integrity hash for tamper detection
8
+ - Comprehensive metadata (cutoff date, feature columns, row counts)
9
+ - Version tracking for reproducibility
10
+
11
+ Example:
12
+ >>> from customer_retention.stages.temporal import SnapshotManager
13
+ >>> from datetime import datetime
14
+ >>>
15
+ >>> manager = SnapshotManager(base_path="./output")
16
+ >>> metadata = manager.create_snapshot(
17
+ ... df=prepared_df,
18
+ ... cutoff_date=datetime(2024, 6, 1),
19
+ ... target_column="churn"
20
+ ... )
21
+ >>> print(f"Created {metadata.snapshot_id} with hash {metadata.data_hash}")
22
+ >>>
23
+ >>> # Load with integrity verification
24
+ >>> df, meta = manager.load_snapshot("training_v1")
25
+ """
26
+
27
+ import hashlib
28
+ import json
29
+ from dataclasses import dataclass
30
+ from datetime import datetime
31
+ from pathlib import Path
32
+ from typing import Any, Optional
33
+
34
+ import pandas as pd
35
+
36
+
37
@dataclass
class SnapshotMetadata:
    """Descriptive record stored alongside one training data snapshot.

    One instance is produced per snapshot and persisted as JSON next to the
    snapshot data, then read back when the snapshot is loaded.
    """

    # Unique identifier of the snapshot, e.g. "training_v1".
    snapshot_id: str
    # Numeric version number of the snapshot.
    version: int
    # When the snapshot was created.
    created_at: datetime
    # Point-in-time cutoff used for label availability.
    cutoff_date: datetime
    # Textual description of the row filter that was applied.
    label_availability_filter: str
    # Number of rows in the snapshot.
    row_count: int
    # Number of columns in the snapshot.
    column_count: int
    # Hex digest of the data, used for integrity verification.
    data_hash: str
    # Names of the feature columns.
    feature_columns: list[str]
    # Name of the target column.
    target_column: str
    # Configuration used for timestamp handling.
    timestamp_config: dict[str, Any]
65
+
66
+
67
class SnapshotManager:
    """Manage versioned training data snapshots with integrity verification.

    A snapshot keeps only the rows whose label is available and whose
    timestamp is on or before the cutoff date. Every snapshot is identified
    as ``"<snapshot_name>_v<version>"``, has a JSON metadata file next to it,
    and carries a truncated SHA256 digest that is re-checked on load.

    Data is written through ``storage`` when one is configured and the
    snapshot is non-empty; otherwise it is written as
    ``<snapshot_id>.parquet`` inside the snapshots directory.

    Example:
        >>> manager = SnapshotManager(base_path="./output")
        >>> # Create a new snapshot
        >>> meta = manager.create_snapshot(df, cutoff_date, "churn")
        >>> # List all snapshots
        >>> print(manager.list_snapshots())  # ["training_v1", "training_v2"]
        >>> # Load with verification
        >>> df, meta = manager.load_snapshot("training_v1")
    """

    def __init__(self, base_path: Path, storage=None):
        """Create the ``snapshots`` directory under ``base_path``.

        Args:
            base_path: Root output directory; snapshots live in
                ``base_path / "snapshots"``.
            storage: Optional backend providing ``exists``, ``read``,
                ``write`` and ``history``. When falsy, ``_get_storage()``
                is used to obtain one (it may return ``None``).
        """
        self.base_path = Path(base_path)
        self.snapshots_dir = self.base_path / "snapshots"
        self.snapshots_dir.mkdir(parents=True, exist_ok=True)
        self.storage = storage or _get_storage()

    def create_snapshot(
        self, df: pd.DataFrame, cutoff_date: datetime, target_column: str,
        snapshot_name: str = "training", timestamp_series: Optional[pd.Series] = None,
    ) -> SnapshotMetadata:
        """Filter ``df`` to a point-in-time snapshot and persist it.

        Args:
            df: Source frame. Must contain ``label_available_flag`` and,
                unless ``timestamp_series`` is given, ``feature_timestamp``.
            cutoff_date: Rows with a timestamp after this are excluded.
            target_column: Name of the target column (left out of the
                recorded feature columns).
            snapshot_name: Base name; the id becomes
                ``"<snapshot_name>_v<version>"``.
            timestamp_series: Optional timestamps to compare against the
                cutoff instead of ``df["feature_timestamp"]``.

        Returns:
            The metadata describing the snapshot that was written.
        """
        if timestamp_series is not None:
            ts = timestamp_series
        else:
            ts = df["feature_timestamp"]
        # Element-wise comparison on purpose: the flag column may hold
        # missing values, which must not count as available.
        snapshot_df = df[
            (df["label_available_flag"] == True) & (ts <= cutoff_date)  # noqa: E712
        ].copy()

        table_path = self.snapshots_dir / snapshot_name
        version = self._next_version(table_path, snapshot_name)
        snapshot_id = f"{snapshot_name}_v{version}"
        data_hash = self._compute_hash(snapshot_df, cutoff_date)

        metadata_cols = ["feature_timestamp", "label_timestamp", "label_available_flag"]
        feature_cols = [c for c in snapshot_df.columns if c not in metadata_cols and c != target_column]

        self._write_snapshot(snapshot_df, str(table_path), snapshot_id)

        metadata = SnapshotMetadata(
            snapshot_id=snapshot_id, version=version, created_at=datetime.now(),
            cutoff_date=cutoff_date, label_availability_filter="label_available_flag == True AND feature_timestamp <= cutoff",
            row_count=len(snapshot_df), column_count=len(snapshot_df.columns),
            data_hash=data_hash, feature_columns=feature_cols, target_column=target_column,
            timestamp_config={"cutoff_date": cutoff_date.isoformat(), "label_available_only": True},
        )

        self._save_metadata(metadata, snapshot_id)
        return metadata

    def load_snapshot(self, snapshot_id: str) -> tuple[pd.DataFrame, SnapshotMetadata]:
        """Load a snapshot and verify its integrity hash.

        Args:
            snapshot_id: Identifier of the form ``"<name>_v<version>"``.

        Returns:
            The snapshot data and its metadata.

        Raises:
            FileNotFoundError: If no storage table exists for the snapshot
                and its parquet file is missing.
            ValueError: If ``snapshot_id`` is malformed, or the recomputed
                hash differs from the one stored in the metadata.
        """
        snapshot_name, version = self._parse_snapshot_id(snapshot_id)
        table_path = self.snapshots_dir / snapshot_name

        if self.storage and self.storage.exists(str(table_path)):
            # Snapshot versions start at 1; the storage version requested is
            # one lower (presumably zero-based table versions - confirm
            # against the storage adapter).
            delta_version = version - 1
            df = self.storage.read(str(table_path), version=delta_version)
        else:
            parquet_path = self.snapshots_dir / f"{snapshot_id}.parquet"
            if not parquet_path.exists():
                raise FileNotFoundError(f"Snapshot not found: {snapshot_id}")
            df = pd.read_parquet(parquet_path)

        metadata = self._load_metadata(snapshot_id)

        current_hash = self._compute_hash(df, metadata.cutoff_date)
        if current_hash != metadata.data_hash:
            raise ValueError(f"Snapshot integrity check failed for {snapshot_id}. Cutoff date or data may have changed.")

        return df, metadata

    def list_snapshots(self) -> list[str]:
        """Return the ids of all snapshots found on disk and in storage.

        Parquet snapshots are listed from ``*_v*.parquet`` file names. When a
        storage backend is configured, one id per history entry is added for
        every subdirectory the backend recognises.
        """
        snapshots = [p.stem for p in self.snapshots_dir.glob("*_v*.parquet")]
        if self.storage:
            for subdir in self.snapshots_dir.iterdir():
                if subdir.is_dir() and self.storage.exists(str(subdir)):
                    history_len = len(self.storage.history(str(subdir)))
                    for v in range(1, history_len + 1):
                        sid = f"{subdir.name}_v{v}"
                        if sid not in snapshots:
                            snapshots.append(sid)
        return snapshots

    def get_latest_snapshot(self, snapshot_name: str = "training") -> Optional[str]:
        """Return the highest-versioned snapshot id for ``snapshot_name``.

        The name is matched exactly, so asking for ``"training"`` never
        returns a snapshot of ``"training_val"``. Ids that do not parse as
        ``"<name>_v<int>"`` are ignored.

        Returns:
            The snapshot id, or ``None`` when no snapshot has that name.
        """
        latest_id: Optional[str] = None
        latest_version: Optional[int] = None
        for sid in self.list_snapshots():
            try:
                name, version = self._parse_snapshot_id(sid)
            except ValueError:
                continue
            if name != snapshot_name:
                continue
            if latest_version is None or version >= latest_version:
                latest_id, latest_version = sid, version
        return latest_id

    def compare_snapshots(self, snapshot_id_1: str, snapshot_id_2: str) -> dict[str, Any]:
        """Summarise metadata differences between two snapshots.

        Both snapshots are loaded through ``load_snapshot``, so each one is
        integrity-checked before being compared.

        Returns:
            A dict with the two ids, row/column count differences
            (second minus first), both cutoff dates, and the sets of
            feature columns added and removed.
        """
        _, meta1 = self.load_snapshot(snapshot_id_1)
        _, meta2 = self.load_snapshot(snapshot_id_2)

        return {
            "snapshot_1": snapshot_id_1,
            "snapshot_2": snapshot_id_2,
            "row_diff": meta2.row_count - meta1.row_count,
            "column_diff": meta2.column_count - meta1.column_count,
            "cutoff_1": meta1.cutoff_date,
            "cutoff_2": meta2.cutoff_date,
            "new_features": set(meta2.feature_columns) - set(meta1.feature_columns),
            "removed_features": set(meta1.feature_columns) - set(meta2.feature_columns),
        }

    def _write_snapshot(self, df: pd.DataFrame, table_path: str, snapshot_id: str) -> None:
        """Persist ``df`` via storage, or as parquet when that is not possible.

        Storage is used only for non-empty frames; an empty frame (or a
        missing backend) results in ``<snapshot_id>.parquet`` written next to
        ``table_path``.
        """
        if self.storage and len(df) > 0:
            metadata = {"snapshot_id": snapshot_id, "created_at": datetime.now().isoformat()}
            self.storage.write(df.reset_index(drop=True), table_path, mode="overwrite", metadata=metadata)
        else:
            parquet_path = Path(table_path).parent / f"{snapshot_id}.parquet"
            df.to_parquet(parquet_path, index=False)

    def _next_version(self, table_path: Path, snapshot_name: str) -> int:
        """Return the version number the next snapshot should receive.

        With a storage backend and an existing table (or directory) the
        version is the history length plus one. Otherwise it is one more than
        the highest version among parquet files belonging to exactly
        ``snapshot_name``, so an existing version is never issued again even
        if an earlier snapshot file was deleted.
        """
        if self.storage and (self.storage.exists(str(table_path)) or table_path.is_dir()):
            return len(self.storage.history(str(table_path))) + 1

        highest = 0
        for parquet_file in self.snapshots_dir.glob(f"{snapshot_name}_v*.parquet"):
            try:
                name, version = self._parse_snapshot_id(parquet_file.stem)
            except ValueError:
                continue
            # The glob also matches longer names such as "<name>_val_v1".
            if name == snapshot_name and version > highest:
                highest = version
        return highest + 1

    def _parse_snapshot_id(self, snapshot_id: str) -> tuple[str, int]:
        """Split ``"<name>_v<version>"`` into ``(name, version)``.

        The split happens at the last ``"_v"`` so names containing ``"_v"``
        are supported.

        Raises:
            ValueError: If the id has no ``"_v"`` separator or the version
                part is not an integer.
        """
        name, separator, version_text = snapshot_id.rpartition("_v")
        if not separator:
            raise ValueError(f"Invalid snapshot id (expected '<name>_v<version>'): {snapshot_id!r}")
        try:
            return name, int(version_text)
        except ValueError:
            raise ValueError(
                f"Invalid snapshot id (expected '<name>_v<version>'): {snapshot_id!r}"
            ) from None

    def _compute_hash(self, df: pd.DataFrame, cutoff_date: Optional[datetime] = None) -> str:
        """Return a 16-character hex digest identifying ``df`` and the cutoff.

        The frame is normalised on a copy (index reset, datetime columns
        floored to microseconds and rendered as text, extension dtypes cast
        to object, columns sorted by name) so the digest does not depend on
        index, column order or dtype representation. The result is the first
        16 hex characters of a SHA256 digest.
        """
        df_stable = df.reset_index(drop=True).copy()
        for col in df_stable.columns:
            if pd.api.types.is_datetime64_any_dtype(df_stable[col]):
                df_stable[col] = df_stable[col].dt.floor("us").astype(str)
        for col in df_stable.columns:
            if pd.api.types.is_extension_array_dtype(df_stable[col]):
                df_stable[col] = df_stable[col].astype(object)
        df_stable = df_stable[sorted(df_stable.columns)]

        data_bytes = pd.util.hash_pandas_object(df_stable).values.tobytes()
        if cutoff_date is not None:
            # Round-trip through ISO text so the hashed value matches what is
            # rebuilt from the saved metadata on load.
            normalized = datetime.fromisoformat(cutoff_date.isoformat())
            data_bytes += normalized.isoformat().encode("utf-8")

        return hashlib.sha256(data_bytes).hexdigest()[:16]

    def _save_metadata(self, metadata: SnapshotMetadata, snapshot_id: str) -> None:
        """Write ``metadata`` to ``<snapshot_id>_metadata.json``.

        Datetime fields are stored as ISO-8601 strings.
        """
        metadata_path = self.snapshots_dir / f"{snapshot_id}_metadata.json"
        metadata_dict = {
            "snapshot_id": metadata.snapshot_id,
            "version": metadata.version,
            "created_at": metadata.created_at.isoformat(),
            "cutoff_date": metadata.cutoff_date.isoformat(),
            "label_availability_filter": metadata.label_availability_filter,
            "row_count": metadata.row_count,
            "column_count": metadata.column_count,
            "data_hash": metadata.data_hash,
            "feature_columns": metadata.feature_columns,
            "target_column": metadata.target_column,
            "timestamp_config": metadata.timestamp_config,
        }
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata_dict, f, indent=2)

    def _load_metadata(self, snapshot_id: str) -> SnapshotMetadata:
        """Read ``<snapshot_id>_metadata.json`` back into a ``SnapshotMetadata``.

        ISO-8601 strings are parsed back into ``datetime`` objects.
        """
        metadata_path = self.snapshots_dir / f"{snapshot_id}_metadata.json"
        with open(metadata_path, encoding="utf-8") as f:
            data = json.load(f)

        return SnapshotMetadata(
            snapshot_id=data["snapshot_id"],
            version=data["version"],
            created_at=datetime.fromisoformat(data["created_at"]),
            cutoff_date=datetime.fromisoformat(data["cutoff_date"]),
            label_availability_filter=data["label_availability_filter"],
            row_count=data["row_count"],
            column_count=data["column_count"],
            data_hash=data["data_hash"],
            feature_columns=data["feature_columns"],
            target_column=data["target_column"],
            timestamp_config=data["timestamp_config"],
        )
252
+
253
+
254
def _get_storage():
    """Return the local delta storage adapter, or ``None`` if unavailable.

    The adapter factory is imported lazily. Any ``ImportError`` raised while
    importing it or while building the adapter results in ``None``.
    """
    backend = None
    try:
        from customer_retention.integrations.adapters.factory import get_delta
        backend = get_delta(force_local=True)
    except ImportError:
        pass
    return backend
@@ -0,0 +1,66 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+ from .point_in_time_registry import PointInTimeRegistry
5
+ from .timestamp_manager import TimestampConfig, TimestampStrategy
6
+
7
+
8
@dataclass
class SyntheticCoordinationParams:
    """Shared settings used to build synthetic timestamp configurations."""

    # Passed as ``synthetic_base_date`` when a config is created.
    base_date: str = "2024-01-01"
    # Passed as ``synthetic_range_days`` when a config is created.
    range_days: int = 365
    # Passed as ``observation_window_days`` when a config is created.
    observation_window_days: int = 90
    # Timestamp strategy applied to every config that is created.
    strategy: TimestampStrategy = TimestampStrategy.SYNTHETIC_INDEX
14
+
15
+
16
class SyntheticTimestampCoordinator:
    """Issue synthetic timestamp configs that all derive from one parameter set.

    Every config handed out is remembered under its dataset name so that the
    registered configs can later be checked for mutual compatibility.
    """

    def __init__(self, params: Optional[SyntheticCoordinationParams] = None):
        """Store ``params`` (defaults are used when none are given)."""
        self._params = params or SyntheticCoordinationParams()
        self._registered_configs: dict[str, TimestampConfig] = {}

    def create_config(self, dataset_name: str) -> TimestampConfig:
        """Build a config from the shared parameters and register it.

        A config already registered under ``dataset_name`` is replaced.
        """
        shared = self._params
        new_config = TimestampConfig(
            strategy=shared.strategy,
            synthetic_base_date=shared.base_date,
            synthetic_range_days=shared.range_days,
            observation_window_days=shared.observation_window_days,
        )
        self._registered_configs[dataset_name] = new_config
        return new_config

    @property
    def registered_datasets(self) -> list[str]:
        """Names of the datasets a config has been created for."""
        return list(self._registered_configs)

    def validate_compatibility(self) -> tuple[bool, str]:
        """Check that all registered configs agree with the first one.

        Base date, observation window and strategy are compared, in that
        order, for each dataset in registration order.

        Returns:
            ``(True, "compatible")`` when zero or one config is registered or
            all of them agree; otherwise ``(False, message)`` describing the
            first mismatch found.
        """
        if len(self._registered_configs) <= 1:
            return True, "compatible"

        expected = next(iter(self._registered_configs.values()))
        for dataset, current in self._registered_configs.items():
            if current.synthetic_base_date != expected.synthetic_base_date:
                return False, (
                    f"Incompatible base_date: '{dataset}' has {current.synthetic_base_date}, "
                    f"expected {expected.synthetic_base_date}"
                )
            if current.observation_window_days != expected.observation_window_days:
                return False, (
                    f"Incompatible observation_window_days: '{dataset}' has {current.observation_window_days}, "
                    f"expected {expected.observation_window_days}"
                )
            if current.strategy != expected.strategy:
                return False, (
                    f"Incompatible strategy: '{dataset}' has {current.strategy.value}, "
                    f"expected {expected.strategy.value}"
                )
        return True, "compatible"

    @classmethod
    def from_registry(cls, registry: PointInTimeRegistry) -> "SyntheticTimestampCoordinator":
        """Create a coordinator whose base date is the registry's reference cutoff.

        When the registry has no reference cutoff, a coordinator with default
        parameters is returned.
        """
        cutoff = registry.get_reference_cutoff()
        if cutoff is None:
            return cls()
        return cls(SyntheticCoordinationParams(base_date=cutoff.strftime("%Y-%m-%d")))