churnkit-0.75.0a1-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,744 @@
+ """Unified feature store manager for leakage-safe feature management.
+
+ This module provides a unified interface for feature store operations
+ that works with both Feast (local) and Databricks (production) backends.
+ """
+
+ import hashlib
+ import json
+ from abc import ABC, abstractmethod
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Optional
+
+ import pandas as pd
+
+ from customer_retention.stages.temporal import PointInTimeRegistry, SnapshotManager
+
+ from .registry import FeatureRegistry
+
+
+ class FeatureStoreBackend(ABC):
+     """Abstract base class for feature store backends."""
+
+     @abstractmethod
+     def create_feature_table(
+         self,
+         name: str,
+         entity_key: str,
+         timestamp_column: str,
+         schema: dict[str, str],
+         cutoff_date: Optional[datetime] = None,
+     ) -> str:
+         pass
+
+     @abstractmethod
+     def write_features(
+         self,
+         table_name: str,
+         df: pd.DataFrame,
+         mode: str = "merge",
+         cutoff_date: Optional[datetime] = None,
+     ) -> None:
+         pass
+
+     @abstractmethod
+     def get_historical_features(
+         self,
+         entity_df: pd.DataFrame,
+         feature_refs: list[str],
+         timestamp_column: str = "event_timestamp",
+     ) -> pd.DataFrame:
+         """Get point-in-time correct historical features."""
+         pass
+
+     @abstractmethod
+     def get_online_features(
+         self,
+         entity_keys: dict[str, list[Any]],
+         feature_refs: list[str],
+     ) -> dict[str, Any]:
+         """Get features for online serving."""
+         pass
+
+     @abstractmethod
+     def list_tables(self) -> list[str]:
+         """List all feature tables."""
+         pass
+
+
+ class FeastBackend(FeatureStoreBackend):
+
+     def __init__(self, repo_path: str = "./feature_store/feature_repo"):
+         self.repo_path = Path(repo_path)
+         self.repo_path.mkdir(parents=True, exist_ok=True)
+         self._store = None
+         self._tables: dict[str, dict] = {}
+         self._load_table_metadata()
+         self.storage = _get_storage()
+
+     @property
+     def store(self):
+         """Lazy-load Feast store."""
+         if self._store is None:
+             try:
+                 from feast import FeatureStore
+                 self._store = FeatureStore(repo_path=str(self.repo_path))
+             except ImportError:
+                 raise ImportError("Feast is required. Install with: pip install feast")
+         return self._store
+
+     def create_feature_table(
+         self,
+         name: str,
+         entity_key: str,
+         timestamp_column: str,
+         schema: dict[str, str],
+         cutoff_date: Optional[datetime] = None,
+     ) -> str:
+         self._tables[name] = {
+             "entity_key": entity_key,
+             "timestamp_column": timestamp_column,
+             "schema": schema,
+             "cutoff_date": cutoff_date.isoformat() if cutoff_date else None,
+             "created_at": datetime.now().isoformat(),
+         }
+         self._save_table_metadata()
+         return name
+
+     def _load_table_metadata(self) -> None:
+         metadata_path = self.repo_path / "feature_tables_metadata.json"
+         if metadata_path.exists():
+             with open(metadata_path) as f:
+                 self._tables = json.load(f)
+
+     def _save_table_metadata(self) -> None:
+         metadata_path = self.repo_path / "feature_tables_metadata.json"
+         with open(metadata_path, "w") as f:
+             json.dump(self._tables, f, indent=2)
+
+     def _compute_feature_hash(self, df: pd.DataFrame, cutoff_date: Optional[datetime] = None) -> str:
+         df_stable = df.reset_index(drop=True).copy()
+         for col in df_stable.select_dtypes(include=["datetime64", "datetime64[ns]"]).columns:
+             df_stable[col] = df_stable[col].astype(str)
+         df_stable = df_stable[sorted(df_stable.columns)]
+         data_bytes = pd.util.hash_pandas_object(df_stable).values.tobytes()
+         if cutoff_date:
+             data_bytes += cutoff_date.isoformat().encode("utf-8")
+         return hashlib.sha256(data_bytes).hexdigest()[:16]
+
+     def get_table_cutoff_date(self, name: str) -> Optional[datetime]:
+         if name not in self._tables:
+             return None
+         cutoff_str = self._tables[name].get("cutoff_date")
+         return datetime.fromisoformat(cutoff_str) if cutoff_str else None
+
+     def validate_cutoff_consistency(self, proposed_cutoff: datetime) -> tuple[bool, str]:
+         existing_cutoffs = {
+             name: self.get_table_cutoff_date(name)
+             for name in self._tables
+             if self.get_table_cutoff_date(name) is not None
+         }
+         if not existing_cutoffs:
+             return True, "First feature table - cutoff date will be set as reference"
+
+         reference_date = next(iter(existing_cutoffs.values())).date()
+         if proposed_cutoff.date() != reference_date:
+             return False, (
+                 f"Cutoff mismatch. Existing tables use {reference_date}. "
+                 f"Proposed: {proposed_cutoff.date()}. All feature tables must use same cutoff."
+             )
+         return True, f"Cutoff date matches reference: {reference_date}"
+
+     def write_features(
+         self,
+         table_name: str,
+         df: pd.DataFrame,
+         mode: str = "merge",
+         cutoff_date: Optional[datetime] = None,
+     ) -> None:
+         delta_path = self.repo_path / "data" / table_name
+         parquet_path = self.repo_path / "data" / f"{table_name}.parquet"
+         delta_path.parent.mkdir(parents=True, exist_ok=True)
+
+         if self.storage:
+             if mode == "merge" and self.storage.exists(str(delta_path)):
+                 entity_key = self._tables.get(table_name, {}).get("entity_key", "entity_id")
+                 condition = f"source.{entity_key} = target.{entity_key}"
+                 self.storage.merge(df, str(delta_path), condition=condition)
+                 df = self.storage.read(str(delta_path))
+             else:
+                 self.storage.write(df, str(delta_path))
+         else:
+             if mode == "merge" and parquet_path.exists():
+                 existing = pd.read_parquet(parquet_path)
+                 if table_name in self._tables:
+                     entity_key = self._tables[table_name]["entity_key"]
+                     df = pd.concat([existing, df]).drop_duplicates(subset=[entity_key], keep="last")
+             df.to_parquet(parquet_path, index=False)
+
+         effective_cutoff = cutoff_date or (
+             datetime.fromisoformat(self._tables[table_name]["cutoff_date"])
+             if table_name in self._tables and self._tables[table_name].get("cutoff_date")
+             else None
+         )
+
+         if table_name in self._tables:
+             self._tables[table_name]["data_hash"] = self._compute_feature_hash(df, effective_cutoff)
+             self._tables[table_name]["row_count"] = len(df)
+             self._tables[table_name]["updated_at"] = datetime.now().isoformat()
+             self._save_table_metadata()
+
+     def get_historical_features(
+         self,
+         entity_df: pd.DataFrame,
+         feature_refs: list[str],
+         timestamp_column: str = "event_timestamp",
+     ) -> pd.DataFrame:
+         """Get point-in-time correct historical features using Feast."""
+         try:
+             return self.store.get_historical_features(
+                 entity_df=entity_df,
+                 features=feature_refs,
+             ).to_df()
+         except Exception:
+             # Fallback: manual PIT join from parquet files
+             return self._manual_pit_join(entity_df, feature_refs, timestamp_column)
+
+     def _manual_pit_join(
+         self,
+         entity_df: pd.DataFrame,
+         feature_refs: list[str],
+         timestamp_column: str,
+     ) -> pd.DataFrame:
+         """Manual point-in-time join when Feast is not configured."""
+         result = entity_df.copy()
+
+         for ref in feature_refs:
+             parts = ref.split(":")
+             if len(parts) != 2:
+                 continue
+
+             table_name, feature_name = parts
+             feature_df = self._read_table_data(table_name)
+             if feature_df is None:
+                 continue
+             if feature_name not in feature_df.columns:
+                 continue
+
+             # Get entity key from table metadata
+             entity_key = self._tables.get(table_name, {}).get("entity_key", "entity_id")
+             ts_col = self._tables.get(table_name, {}).get("timestamp_column", "feature_timestamp")
+
+             if ts_col in feature_df.columns and timestamp_column in entity_df.columns:
+                 # Point-in-time join
+                 merged = result.merge(
+                     feature_df[[entity_key, ts_col, feature_name]],
+                     on=entity_key,
+                     how="left",
+                 )
+                 # Keep only features from before the entity timestamp
+                 valid = merged[merged[ts_col] <= merged[timestamp_column]]
+                 # Take latest valid feature per entity
+                 valid = valid.sort_values(ts_col).groupby(entity_key).last().reset_index()
+                 result = result.merge(
+                     valid[[entity_key, feature_name]],
+                     on=entity_key,
+                     how="left",
+                 )
+             else:
+                 # Simple join without PIT
+                 result = result.merge(
+                     feature_df[[entity_key, feature_name]],
+                     on=entity_key,
+                     how="left",
+                 )
+
+         return result
+
+     def get_online_features(
+         self,
+         entity_keys: dict[str, list[Any]],
+         feature_refs: list[str],
+     ) -> dict[str, Any]:
+         """Get features for online serving."""
+         try:
+             entity_rows = [
+                 {k: v[i] for k, v in entity_keys.items()}
+                 for i in range(len(next(iter(entity_keys.values()))))
+             ]
+             return self.store.get_online_features(
+                 features=feature_refs,
+                 entity_rows=entity_rows,
+             ).to_dict()
+         except Exception:
+             # Fallback: read latest from parquet
+             entity_df = pd.DataFrame(entity_keys)
+             result = self.get_historical_features(
+                 entity_df, feature_refs, "event_timestamp"
+             )
+             return result.to_dict("list")
+
+     def list_tables(self) -> list[str]:
+         """List all feature tables."""
+         data_dir = self.repo_path / "data"
+         if not data_dir.exists():
+             return []
+         tables = [p.stem for p in data_dir.glob("*.parquet")]
+         if self.storage:
+             for subdir in data_dir.iterdir():
+                 if subdir.is_dir() and self.storage.exists(str(subdir)) and subdir.name not in tables:
+                     tables.append(subdir.name)
+         return tables
+
+     def _read_table_data(self, table_name: str) -> Optional[pd.DataFrame]:
+         delta_path = self.repo_path / "data" / table_name
+         parquet_path = self.repo_path / "data" / f"{table_name}.parquet"
+         if self.storage and self.storage.exists(str(delta_path)):
+             return self.storage.read(str(delta_path))
+         if parquet_path.exists():
+             return pd.read_parquet(parquet_path)
+         return None
+
+
+ class DatabricksBackend(FeatureStoreBackend):
+     """Databricks Feature Engineering backend for production."""
+
+     def __init__(self, catalog: str = "main", schema: str = "features"):
+         self.catalog = catalog
+         self.schema = schema
+         self._client = None
+
+     @property
+     def client(self):
+         """Lazy-load Databricks Feature Engineering client."""
+         if self._client is None:
+             try:
+                 from databricks.feature_engineering import FeatureEngineeringClient
+                 self._client = FeatureEngineeringClient()
+             except ImportError:
+                 raise ImportError(
+                     "Databricks Feature Engineering is required. "
+                     "Run on a Databricks cluster."
+                 )
+         return self._client
+
+     def _full_table_name(self, name: str) -> str:
+         """Get fully qualified table name."""
+         return f"{self.catalog}.{self.schema}.{name}"
+
+     def create_feature_table(
+         self,
+         name: str,
+         entity_key: str,
+         timestamp_column: str,
+         schema: dict[str, str],
+         cutoff_date: Optional[datetime] = None,
+     ) -> str:
+         from pyspark.sql import SparkSession
+         from pyspark.sql.types import FloatType, IntegerType, StringType, StructField, StructType, TimestampType
+
+         spark = SparkSession.builder.getOrCreate()
+
+         type_mapping = {
+             "string": StringType(),
+             "float64": FloatType(),
+             "float": FloatType(),
+             "int64": IntegerType(),
+             "int": IntegerType(),
+             "datetime": TimestampType(),
+         }
+
+         fields = [StructField(col_name, type_mapping.get(dtype, StringType()), True) for col_name, dtype in schema.items()]
+         spark_schema = StructType(fields)
+
+         empty_df = spark.createDataFrame([], spark_schema)
+         full_name = self._full_table_name(name)
+
+         self.client.create_table(
+             name=full_name,
+             primary_keys=[entity_key],
+             timestamp_keys=[timestamp_column] if timestamp_column else None,
+             df=empty_df,
+             description=f"Point-in-time cutoff: {cutoff_date.isoformat() if cutoff_date else 'N/A'}",
+         )
+
+         return full_name
+
+     def write_features(
+         self,
+         table_name: str,
+         df: pd.DataFrame,
+         mode: str = "merge",
+         cutoff_date: Optional[datetime] = None,
+     ) -> None:
+         from pyspark.sql import SparkSession
+         spark = SparkSession.builder.getOrCreate()
+         spark_df = spark.createDataFrame(df)
+
+         full_name = self._full_table_name(table_name)
+         self.client.write_table(name=full_name, df=spark_df, mode=mode)
+
+     def get_historical_features(
+         self,
+         entity_df: pd.DataFrame,
+         feature_refs: list[str],
+         timestamp_column: str = "event_timestamp",
+     ) -> pd.DataFrame:
+         """Get point-in-time correct historical features."""
+         from databricks.feature_engineering import FeatureLookup
+         from pyspark.sql import SparkSession
+
+         spark = SparkSession.builder.getOrCreate()
+         entity_spark = spark.createDataFrame(entity_df)
+
+         lookups = []
+         for ref in feature_refs:
+             parts = ref.split(":")
+             if len(parts) == 2:
+                 table_name, feature_name = parts
+                 full_name = self._full_table_name(table_name)
+                 lookups.append(
+                     FeatureLookup(
+                         table_name=full_name,
+                         feature_names=[feature_name],
+                         lookup_key=list(entity_df.columns[:1]),
+                         timestamp_lookup_key=timestamp_column,
+                     )
+                 )
+
+         training_set = self.client.create_training_set(
+             df=entity_spark,
+             feature_lookups=lookups,
+             label=None,
+         )
+
+         return training_set.load_df().toPandas()
+
+     def get_online_features(
+         self,
+         entity_keys: dict[str, list[Any]],
+         feature_refs: list[str],
+     ) -> dict[str, Any]:
+         """Get features for online serving via Model Serving."""
+         from databricks.feature_engineering import FeatureLookup
+         from pyspark.sql import SparkSession
+
+         spark = SparkSession.builder.getOrCreate()
+         entity_df = pd.DataFrame(entity_keys)
+         entity_spark = spark.createDataFrame(entity_df)
+
+         lookups = []
+         for ref in feature_refs:
+             parts = ref.split(":")
+             if len(parts) == 2:
+                 table_name, _ = parts
+                 full_name = self._full_table_name(table_name)
+                 lookups.append(
+                     FeatureLookup(
+                         table_name=full_name,
+                         lookup_key=list(entity_keys.keys()),
+                     )
+                 )
+
+         result = self.client.score_batch(df=entity_spark, feature_lookups=lookups)
+         return result.toPandas().to_dict("list")
+
+     def list_tables(self) -> list[str]:
+         """List all feature tables in the schema."""
+         tables = self.client.list_tables()
+         prefix = f"{self.catalog}.{self.schema}."
+         return [
+             t.name.replace(prefix, "")
+             for t in tables
+             if t.name.startswith(prefix)
+         ]
+
+
+ class FeatureStoreManager:
+     """Unified manager for feature store operations.
+
+     This class provides a high-level interface for feature store operations
+     that works seamlessly with both local (Feast) and production (Databricks)
+     backends, while ensuring point-in-time correctness.
+
+     Example:
+         >>> manager = FeatureStoreManager.create(backend="feast")
+         >>> manager.publish_features(df, registry, "customer_features")
+         >>> training_df = manager.get_training_features(
+         ...     entity_df, registry, ["tenure_months", "total_spend_30d"]
+         ... )
+     """
+
+     def __init__(self, backend: FeatureStoreBackend, output_path: Optional[Path] = None):
+         self.backend = backend
+         self.output_path = Path(output_path) if output_path else Path("./output")
+         self.snapshot_manager = SnapshotManager(self.output_path)
+         self.pit_registry = PointInTimeRegistry(self.output_path)
+
+     @classmethod
+     def create(
+         cls,
+         backend: str = "feast",
+         repo_path: str = "./feature_store/feature_repo",
+         catalog: str = "main",
+         schema: str = "features",
+         output_path: Optional[str] = None,
+     ) -> "FeatureStoreManager":
+         """Factory method to create a manager with the appropriate backend.
+
+         Args:
+             backend: Backend type ("feast" or "databricks")
+             repo_path: Path to Feast repo (for feast backend)
+             catalog: Unity Catalog name (for databricks backend)
+             schema: Schema name (for databricks backend)
+             output_path: Path for output files
+
+         Returns:
+             Configured FeatureStoreManager
+
+         Raises:
+             ValueError: If unknown backend specified
+         """
+         if backend == "feast":
+             store_backend = FeastBackend(repo_path=repo_path)
+         elif backend == "databricks":
+             store_backend = DatabricksBackend(catalog=catalog, schema=schema)
+         else:
+             raise ValueError(f"Unknown backend: {backend}. Use 'feast' or 'databricks'.")
+
+         return cls(
+             backend=store_backend,
+             output_path=Path(output_path) if output_path else None,
+         )
+
+     def publish_features(
+         self,
+         df: pd.DataFrame,
+         registry: FeatureRegistry,
+         table_name: str,
+         entity_key: str = "entity_id",
+         timestamp_column: str = "feature_timestamp",
+         mode: str = "merge",
+         cutoff_date: Optional[datetime] = None,
+     ) -> str:
+         effective_cutoff = cutoff_date or self.pit_registry.get_reference_cutoff() or datetime.now()
+
+         is_valid, message = self.pit_registry.validate_cutoff(effective_cutoff)
+         if not is_valid:
+             raise ValueError(f"Point-in-time consistency error: {message}")
+
+         if isinstance(self.backend, FeastBackend):
+             backend_valid, backend_msg = self.backend.validate_cutoff_consistency(effective_cutoff)
+             if not backend_valid:
+                 raise ValueError(f"Feature store cutoff mismatch: {backend_msg}")
+
+         missing_features = [f for f in registry.list_features() if f not in df.columns]
+         if missing_features:
+             print(f"Warning: Missing features in DataFrame: {missing_features}")
+
+         schema = {entity_key: "string", timestamp_column: "datetime"}
+         for feature_name in registry.list_features():
+             if feature_name in df.columns:
+                 feature = registry.get(feature_name)
+                 schema[feature_name] = feature.data_type if feature else "float64"
+
+         self.backend.create_feature_table(
+             name=table_name,
+             entity_key=entity_key,
+             timestamp_column=timestamp_column,
+             schema=schema,
+             cutoff_date=effective_cutoff,
+         )
+
+         columns_to_write = [entity_key, timestamp_column] + [f for f in registry.list_features() if f in df.columns]
+         self.backend.write_features(table_name, df[columns_to_write], mode=mode, cutoff_date=effective_cutoff)
+
+         return table_name
+
+     def get_training_features(
+         self,
+         entity_df: pd.DataFrame,
+         registry: FeatureRegistry,
+         feature_names: Optional[list[str]] = None,
+         table_name: str = "customer_features",
+         timestamp_column: str = "event_timestamp",
+     ) -> pd.DataFrame:
+         """Get point-in-time correct features for training.
+
+         Args:
+             entity_df: DataFrame with entity keys and timestamps
+             registry: Feature registry
+             feature_names: Specific features to retrieve (all if None)
+             table_name: Feature table name
+             timestamp_column: Timestamp column in entity_df
+
+         Returns:
+             DataFrame with entity keys, timestamps, and features
+         """
+         feature_refs = registry.get_feature_refs(
+             table_name,
+             feature_names or registry.list_features(),
+         )
+
+         return self.backend.get_historical_features(
+             entity_df=entity_df,
+             feature_refs=feature_refs,
+             timestamp_column=timestamp_column,
+         )
+
+     def get_inference_features(
+         self,
+         entity_df: pd.DataFrame,
+         registry: FeatureRegistry,
+         feature_names: Optional[list[str]] = None,
+         table_name: str = "customer_features",
+         timestamp_column: str = "event_timestamp",
+     ) -> pd.DataFrame:
+         """Get point-in-time correct features for batch inference.
+
+         This is the recommended method for batch inference as it ensures
+         features are retrieved as they existed at the specified inference
+         timestamp, preventing future data leakage.
+
+         Args:
+             entity_df: DataFrame with entity keys and inference timestamps
+                 Must have entity_id column and a timestamp column
+             registry: Feature registry
+             feature_names: Specific features to retrieve (all if None)
+             table_name: Feature table name
+             timestamp_column: Name of the timestamp column in entity_df
+
+         Returns:
+             DataFrame with entity keys, timestamps, and features
+
+         Example:
+             >>> # Create entity DataFrame with inference timestamp
+             >>> entity_df = pd.DataFrame({
+             ...     "entity_id": ["cust_1", "cust_2"],
+             ...     "event_timestamp": [datetime.now(), datetime.now()]
+             ... })
+             >>> # Get features as of the inference timestamp
+             >>> features_df = manager.get_inference_features(
+             ...     entity_df, registry, timestamp_column="event_timestamp"
+             ... )
+         """
+         feature_refs = registry.get_feature_refs(
+             table_name,
+             feature_names or registry.list_features(),
+         )
+
+         return self.backend.get_historical_features(
+             entity_df=entity_df,
+             feature_refs=feature_refs,
+             timestamp_column=timestamp_column,
+         )
+
+     def get_online_features(
+         self,
+         entity_keys: dict[str, list[Any]],
+         registry: FeatureRegistry,
+         feature_names: Optional[list[str]] = None,
+         table_name: str = "customer_features",
+     ) -> dict[str, Any]:
+         """Get latest features for online/real-time inference.
+
+         This returns the latest feature values without point-in-time
+         correctness. Use for real-time serving where you want the
+         most recent features.
+
+         For batch inference with PIT correctness, use get_inference_features().
+
+         Args:
+             entity_keys: Dictionary of entity key column to values
+             registry: Feature registry
+             feature_names: Specific features to retrieve (all if None)
+             table_name: Feature table name
+
+         Returns:
+             Dictionary of feature values
+         """
+         feature_refs = registry.get_feature_refs(
+             table_name,
+             feature_names or registry.list_features(),
+         )
+
+         return self.backend.get_online_features(
+             entity_keys=entity_keys,
+             feature_refs=feature_refs,
+         )
+
+     def create_training_set_from_snapshot(
+         self,
+         snapshot_id: str,
+         registry: FeatureRegistry,
+         target_column: str = "target",
+     ) -> tuple[pd.DataFrame, pd.Series]:
+         """Create a training set from a snapshot.
+
+         This loads a versioned snapshot and prepares it for training,
+         ensuring only the registered features are used.
+
+         Args:
+             snapshot_id: ID of the snapshot to load
+             registry: Feature registry
+             target_column: Name of the target column
+
+         Returns:
+             Tuple of (features DataFrame, target Series)
+         """
+         df, metadata = self.snapshot_manager.load_snapshot(snapshot_id)
+
+         # Get feature columns that exist in both registry and snapshot
+         feature_columns = [
+             f for f in registry.list_features()
+             if f in df.columns
+         ]
+
+         X = df[feature_columns]
+         y = df[target_column] if target_column in df.columns else None
+
+         return X, y
+
+     def list_tables(self) -> list[str]:
+         """List all feature tables.
+
+         Returns:
+             List of table names
+         """
+         return self.backend.list_tables()
+
+
+ def get_feature_store_manager(
+     backend: Optional[str] = None,
+     **kwargs,
+ ) -> FeatureStoreManager:
+     """Get a feature store manager, auto-detecting environment.
+
+     Args:
+         backend: Explicit backend ("feast" or "databricks"), or None for auto-detect
+         **kwargs: Additional arguments for the manager
+
+     Returns:
+         Configured FeatureStoreManager
+     """
+     if backend is None:
+         # Auto-detect environment
+         try:
+             from customer_retention.core.compat.detection import is_databricks
+             if is_databricks():
+                 backend = "databricks"
+             else:
+                 backend = "feast"
+         except ImportError:
+             backend = "feast"
+
+     return FeatureStoreManager.create(backend=backend, **kwargs)
+
+
+ def _get_storage():
+     try:
+         from customer_retention.integrations.adapters.factory import get_delta
+         return get_delta(force_local=True)
+     except ImportError:
+         return None
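
For orientation, here is a minimal usage sketch of the manager added in this file. It is illustrative only and not package code: the entity ids, dates, and the tenure_months feature (echoing the docstring example above) are invented, and the construction of a populated FeatureRegistry (defined in customer_retention/integrations/feature_store/registry.py, not shown in this diff) is left as a placeholder.

    # Hypothetical sketch; assumes churnkit 0.75.0a1 is installed and a
    # populated FeatureRegistry instance is available.
    from datetime import datetime

    import pandas as pd

    from customer_retention.integrations.feature_store.manager import get_feature_store_manager

    # Auto-detects the environment: DatabricksBackend on a Databricks
    # cluster, FeastBackend otherwise.
    manager = get_feature_store_manager(output_path="./output")

    registry = ...  # a populated FeatureRegistry; see registry.py for its API

    # Feature rows keyed by entity and stamped with the feature timestamp.
    features_df = pd.DataFrame({
        "entity_id": ["cust_1", "cust_2"],
        "feature_timestamp": [datetime(2024, 1, 1), datetime(2024, 1, 1)],
        "tenure_months": [12, 34],
    })

    # Publishing validates the point-in-time cutoff against previously
    # published tables and raises ValueError on a mismatch.
    manager.publish_features(
        features_df,
        registry,
        table_name="customer_features",
        cutoff_date=datetime(2024, 1, 1),
    )

    # Point-in-time-correct retrieval: each entity row receives the latest
    # feature values whose timestamps precede its event_timestamp.
    entity_df = pd.DataFrame({
        "entity_id": ["cust_1", "cust_2"],
        "event_timestamp": [datetime(2024, 2, 1), datetime(2024, 2, 1)],
    })
    training_df = manager.get_training_features(entity_df, registry)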