churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,298 @@
1
+ import json
2
+ from dataclasses import asdict, dataclass, field
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ import yaml
8
+
9
+ from customer_retention.core.config.column_config import ColumnType
10
+
11
+
12
@dataclass
class SourceSpec:
    """Describes one input data source consumed by the pipeline."""
    name: str  # logical source name (e.g. "primary_source" from PipelineSpec.from_findings)
    path: str  # file or table location of the source
    format: str  # reader format string taken from findings.source_format
    options: Dict[str, Any] = field(default_factory=dict)  # extra reader options

    def to_dict(self) -> dict:
        """Serialize to a plain dict (asdict deep-copies nested values)."""
        return asdict(self)
21
+
22
+
23
@dataclass
class ColumnSpec:
    """Schema entry for one column: physical dtype plus semantic type."""
    name: str
    data_type: str  # physical type: "string" | "integer" | "float" | "timestamp"
    semantic_type: str  # semantic type value from the exploration findings
    nullable: bool = True
    description: str = ""

    def to_dict(self) -> dict:
        """Serialize to a plain dict."""
        return asdict(self)

    @classmethod
    def from_column_finding(cls, finding) -> "ColumnSpec":
        """Derive a ColumnSpec from a single exploration column finding.

        Maps the finding's inferred ColumnType to a physical dtype
        (unknown types fall back to "string") and marks the column
        nullable when any nulls were observed.
        """
        string_like = (
            ColumnType.IDENTIFIER,
            ColumnType.CATEGORICAL_NOMINAL,
            ColumnType.CATEGORICAL_ORDINAL,
            ColumnType.CATEGORICAL_CYCLICAL,
            ColumnType.TEXT,
        )
        integer_like = (ColumnType.TARGET, ColumnType.BINARY, ColumnType.NUMERIC_DISCRETE)
        dtype_map = {col_type: "string" for col_type in string_like}
        dtype_map.update({col_type: "integer" for col_type in integer_like})
        dtype_map[ColumnType.NUMERIC_CONTINUOUS] = "float"
        dtype_map[ColumnType.DATETIME] = "timestamp"
        observed_nulls = finding.universal_metrics.get("null_count", 0)
        return cls(
            name=finding.name,
            data_type=dtype_map.get(finding.inferred_type, "string"),
            semantic_type=finding.inferred_type.value,
            nullable=observed_nulls > 0,
        )
54
+
55
+
56
@dataclass
class SchemaSpec:
    """Dataset-level schema: column specs plus key and partition metadata."""
    columns: List[ColumnSpec]
    primary_key: Optional[str] = None
    partition_columns: List[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize to a plain dict, serializing each column recursively."""
        serialized_columns = [spec.to_dict() for spec in self.columns]
        return {
            "columns": serialized_columns,
            "primary_key": self.primary_key,
            "partition_columns": self.partition_columns,
        }

    @classmethod
    def from_findings(cls, findings) -> "SchemaSpec":
        """Build a schema from exploration findings.

        The first identifier column (if any) becomes the primary key.
        """
        specs = [ColumnSpec.from_column_finding(f) for f in findings.columns.values()]
        identifiers = findings.identifier_columns
        return cls(columns=specs, primary_key=identifiers[0] if identifiers else None)
74
+
75
+
76
@dataclass
class TransformSpec:
    """One named transform step applied at a pipeline stage (bronze/silver/gold)."""
    name: str  # unique step name, e.g. "scale_<column>" / "encode_<column>"
    transform_type: str  # e.g. "standard_scaling", "one_hot_encoding"
    input_columns: List[str]  # columns the transform reads
    output_columns: List[str]  # columns the transform produces
    parameters: Dict[str, Any] = field(default_factory=dict)  # transform-specific options

    def to_dict(self) -> dict:
        """Serialize to a plain dict (asdict deep-copies nested values)."""
        return asdict(self)
86
+
87
+
88
@dataclass
class FeatureSpec:
    """Definition of a derived feature computed from source columns."""
    name: str  # feature name, e.g. "days_since_<column>"
    source_columns: List[str]  # columns the computation reads
    computation: str  # computation identifier, e.g. "days_since_today"
    description: str = ""
    parameters: Dict[str, Any] = field(default_factory=dict)  # computation options

    def to_dict(self) -> dict:
        """Serialize to a plain dict (asdict deep-copies nested values)."""
        return asdict(self)
98
+
99
+
100
@dataclass
class ModelSpec:
    """Model training configuration for the pipeline."""
    name: str  # model identifier, e.g. "default_model"
    model_type: str  # e.g. "gradient_boosting"
    target_column: str  # label column to predict
    feature_columns: List[str]  # input feature columns
    hyperparameters: Dict[str, Any] = field(default_factory=dict)  # model params
    # Evaluation metrics to report; defaults target binary classification.
    metrics: List[str] = field(default_factory=lambda: ["auc", "precision", "recall", "f1"])

    def to_dict(self) -> dict:
        """Serialize to a plain dict (asdict deep-copies nested values)."""
        return asdict(self)
111
+
112
+
113
@dataclass
class QualityGateSpec:
    """A data-quality check evaluated against a column (or "*" for all)."""
    name: str  # gate identifier, e.g. "schema_check", "null_check"
    gate_type: str  # e.g. "schema_validation", "null_percentage"
    column: str  # target column name, or "*" to apply to every column
    threshold: float  # gate-specific limit (e.g. max null percentage)
    action: str = "fail"  # what happens on violation; "warn" is also used
    parameters: Dict[str, Any] = field(default_factory=dict)  # gate options

    def to_dict(self) -> dict:
        """Serialize to a plain dict (asdict deep-copies nested values)."""
        return asdict(self)
124
+
125
+
126
@dataclass
class PipelineSpec:
    """Declarative specification of an end-to-end churn pipeline.

    Aggregates sources, schema, medallion-stage transforms (bronze/silver/
    gold), feature definitions, model configuration and quality gates, and
    serializes the whole spec to/from JSON or YAML.
    """
    name: str = "pipeline"
    version: str = "1.0.0"
    description: str = ""
    # ISO-8601 creation timestamp. NOTE(review): naive local time; consider
    # timezone-aware UTC — confirm downstream consumers first.
    created_at: str = field(default_factory=lambda: datetime.now().isoformat())
    sources: List[SourceSpec] = field(default_factory=list)
    schema: Optional[SchemaSpec] = None
    bronze_transforms: List[TransformSpec] = field(default_factory=list)
    silver_transforms: List[TransformSpec] = field(default_factory=list)
    gold_transforms: List[TransformSpec] = field(default_factory=list)
    feature_definitions: List[FeatureSpec] = field(default_factory=list)
    model_config: Optional[ModelSpec] = None
    quality_gates: List[QualityGateSpec] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_findings(cls, findings, name: Optional[str] = None) -> "PipelineSpec":
        """Build a spec pre-populated with defaults derived from exploration findings.

        Args:
            findings: findings object exposing source_path, source_format,
                columns, identifier_columns, datetime_columns, target_column.
            name: pipeline name; defaults to "<source stem>_pipeline".
        """
        spec = cls(
            name=name or Path(findings.source_path).stem + "_pipeline",
            description=f"Pipeline generated from {findings.source_path}",
        )
        spec.sources.append(SourceSpec(
            name="primary_source",
            path=findings.source_path,
            format=findings.source_format,
        ))
        spec.schema = SchemaSpec.from_findings(findings)
        spec._add_default_transforms(findings)
        spec._add_default_features(findings)
        spec._add_default_model(findings)
        spec._add_default_quality_gates(findings)
        return spec

    def _add_default_transforms(self, findings):
        """Add standard silver-stage scaling/encoding per inferred column type."""
        for name, col in findings.columns.items():
            # Identifiers and the target are never transformed.
            if col.inferred_type in (ColumnType.IDENTIFIER, ColumnType.TARGET):
                continue
            if col.inferred_type in (ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE):
                self.silver_transforms.append(TransformSpec(
                    name=f"scale_{name}",
                    transform_type="standard_scaling",
                    input_columns=[name],
                    output_columns=[f"{name}_scaled"],
                ))
            elif col.inferred_type in (ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL):
                self.silver_transforms.append(TransformSpec(
                    name=f"encode_{name}",
                    transform_type="one_hot_encoding",
                    input_columns=[name],
                    output_columns=[f"{name}_encoded"],
                ))

    def _add_default_features(self, findings):
        """Add a recency ("days since") feature for every datetime column."""
        for name in findings.datetime_columns:
            self.feature_definitions.append(FeatureSpec(
                name=f"days_since_{name}",
                source_columns=[name],
                computation="days_since_today",
                description=f"Days since {name}",
            ))

    def _add_default_model(self, findings):
        """Configure a gradient-boosting model when a target column exists."""
        if not findings.target_column:
            return
        feature_cols = [
            name for name, col in findings.columns.items()
            if col.inferred_type not in (ColumnType.IDENTIFIER, ColumnType.TARGET)
        ]
        self.model_config = ModelSpec(
            name="default_model",
            model_type="gradient_boosting",
            target_column=findings.target_column,
            feature_columns=feature_cols,
        )

    def _add_default_quality_gates(self, findings):
        """Add baseline gates: strict schema validation plus a null-rate warning."""
        self.quality_gates.append(QualityGateSpec(
            name="schema_check",
            gate_type="schema_validation",
            column="*",
            threshold=0,
        ))
        self.quality_gates.append(QualityGateSpec(
            name="null_check",
            gate_type="null_percentage",
            column="*",
            threshold=50.0,
            action="warn",
        ))

    def add_transform(self, transform: TransformSpec, stage: str = "silver"):
        """Append a transform to the bronze, silver or gold stage.

        Raises:
            ValueError: if *stage* is unknown. (Previously an unknown stage
                silently discarded the transform, masking caller typos.)
        """
        stage_lists = {
            "bronze": self.bronze_transforms,
            "silver": self.silver_transforms,
            "gold": self.gold_transforms,
        }
        if stage not in stage_lists:
            raise ValueError(f"Unknown stage {stage!r}; expected one of {sorted(stage_lists)}")
        stage_lists[stage].append(transform)

    def add_feature(self, feature: FeatureSpec):
        """Append a feature definition."""
        self.feature_definitions.append(feature)

    def add_quality_gate(self, gate: QualityGateSpec):
        """Append a quality gate."""
        self.quality_gates.append(gate)

    def to_dict(self) -> dict:
        """Serialize the full spec to plain JSON/YAML-compatible types."""
        return {
            "name": self.name,
            "version": self.version,
            "description": self.description,
            "created_at": self.created_at,
            "sources": [s.to_dict() for s in self.sources],
            "schema": self.schema.to_dict() if self.schema else None,
            "bronze_transforms": [t.to_dict() for t in self.bronze_transforms],
            "silver_transforms": [t.to_dict() for t in self.silver_transforms],
            "gold_transforms": [t.to_dict() for t in self.gold_transforms],
            "feature_definitions": [f.to_dict() for f in self.feature_definitions],
            "model_config": self.model_config.to_dict() if self.model_config else None,
            "quality_gates": [g.to_dict() for g in self.quality_gates],
            "metadata": self.metadata,
        }

    def to_json(self, indent: int = 2) -> str:
        """Render the spec as a JSON string."""
        return json.dumps(self.to_dict(), indent=indent)

    def to_yaml(self) -> str:
        """Render the spec as YAML, preserving key order."""
        return yaml.dump(self.to_dict(), default_flow_style=False, sort_keys=False)

    def save(self, path):
        """Write the spec to *path*: YAML for .yaml/.yml, JSON otherwise.

        Accepts str or os.PathLike (generalized; previously str only).
        """
        path = str(path)
        content = self.to_yaml() if path.endswith((".yaml", ".yml")) else self.to_json()
        with open(path, "w") as f:
            f.write(content)

    @classmethod
    def load(cls, path) -> "PipelineSpec":
        """Read a spec from *path*; format chosen by extension as in save()."""
        path = str(path)
        with open(path, "r") as f:
            content = f.read()
        data = yaml.safe_load(content) if path.endswith((".yaml", ".yml")) else json.loads(content)
        return cls._from_dict(data)

    @classmethod
    def _from_dict(cls, data: dict) -> "PipelineSpec":
        """Reconstruct a PipelineSpec from a dict produced by to_dict()."""
        spec = cls(
            name=data.get("name", "pipeline"),
            version=data.get("version", "1.0.0"),
            description=data.get("description", ""),
            created_at=data.get("created_at", datetime.now().isoformat()),
        )
        spec.sources = [SourceSpec(**src) for src in data.get("sources", [])]
        if data.get("schema"):
            schema_data = data["schema"]
            spec.schema = SchemaSpec(
                columns=[ColumnSpec(**c) for c in schema_data.get("columns", [])],
                primary_key=schema_data.get("primary_key"),
                partition_columns=schema_data.get("partition_columns", []),
            )
        spec.bronze_transforms = [TransformSpec(**t) for t in data.get("bronze_transforms", [])]
        spec.silver_transforms = [TransformSpec(**t) for t in data.get("silver_transforms", [])]
        spec.gold_transforms = [TransformSpec(**t) for t in data.get("gold_transforms", [])]
        spec.feature_definitions = [FeatureSpec(**f) for f in data.get("feature_definitions", [])]
        if data.get("model_config"):
            spec.model_config = ModelSpec(**data["model_config"])
        spec.quality_gates = [QualityGateSpec(**g) for g in data.get("quality_gates", [])]
        spec.metadata = data.get("metadata", {})
        return spec
File without changes
@@ -0,0 +1,13 @@
1
+ from .base import AdapterResult
2
+ from .factory import get_delta, get_feature_store, get_mlflow
3
+ from .feature_store import DatabricksFeatureStore, FeatureStoreAdapter, LocalFeatureStore
4
+ from .mlflow import DatabricksMLflow, LocalMLflow, MLflowAdapter
5
+ from .storage import DatabricksDelta, DeltaStorage, LocalDelta
6
+
7
# Public surface of the adapters package: the shared result envelope, the three
# adapter families (Delta storage, feature store, MLflow), and their factories.
__all__ = [
    "AdapterResult",
    "DeltaStorage", "LocalDelta", "DatabricksDelta",
    "FeatureStoreAdapter", "LocalFeatureStore", "DatabricksFeatureStore",
    "MLflowAdapter", "LocalMLflow", "DatabricksMLflow",
    "get_delta", "get_feature_store", "get_mlflow",
]
@@ -0,0 +1,10 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Any, Dict, Optional
3
+
4
+
5
+ @dataclass
6
+ class AdapterResult:
7
+ success: bool
8
+ data: Optional[Any] = None
9
+ error: Optional[str] = None
10
+ metadata: Dict[str, Any] = field(default_factory=dict)
@@ -0,0 +1,25 @@
1
+ from customer_retention.core.compat.detection import is_spark_available
2
+
3
+ from .feature_store import DatabricksFeatureStore, FeatureStoreAdapter, LocalFeatureStore
4
+ from .mlflow import DatabricksMLflow, LocalMLflow, MLflowAdapter
5
+ from .storage import DatabricksDelta, DeltaStorage, LocalDelta
6
+
7
+
8
def get_delta(force_local: bool = False) -> DeltaStorage:
    """Return a Delta storage adapter: local unless Spark is usable (or forced local)."""
    use_local = force_local or not is_spark_available()
    return LocalDelta() if use_local else DatabricksDelta()
12
+
13
+
14
def get_feature_store(base_path: str = "./feature_store", catalog: str = "main",
                      schema: str = "default", force_local: bool = False) -> FeatureStoreAdapter:
    """Return a feature-store adapter, preferring Databricks when Spark is present."""
    if not force_local and is_spark_available():
        return DatabricksFeatureStore(catalog=catalog, schema=schema)
    return LocalFeatureStore(base_path=base_path)
19
+
20
+
21
def get_mlflow(tracking_uri: str = "./mlruns", registry_uri: str = "databricks-uc",
               force_local: bool = False) -> MLflowAdapter:
    """Return an MLflow adapter, preferring Databricks when Spark is present."""
    if not force_local and is_spark_available():
        return DatabricksMLflow(registry_uri=registry_uri)
    return LocalMLflow(tracking_uri=tracking_uri)
@@ -0,0 +1,6 @@
1
+ from .base import FeatureStoreAdapter, FeatureViewConfig
2
+ from .databricks import DatabricksFeatureStore
3
+ from .feast_adapter import FeastAdapter
4
+ from .local import LocalFeatureStore
5
+
6
# Explicit public API of the feature_store subpackage.
__all__ = ["FeatureStoreAdapter", "FeatureViewConfig", "LocalFeatureStore", "DatabricksFeatureStore", "FeastAdapter"]
@@ -0,0 +1,57 @@
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass, field
3
+ from datetime import datetime
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ import pandas as pd
7
+
8
+ from ..base import AdapterResult
9
+
10
+
11
@dataclass
class FeatureViewConfig:
    """Declarative description of a feature view to register with a feature store."""

    name: str  # feature view / table name
    entity_key: str  # column used as the entity join key
    features: List[str]  # names of the feature columns the view exposes
    ttl_days: Optional[int] = None  # optional time-to-live in days; None = no TTL
    tags: Dict[str, str] = field(default_factory=dict)  # arbitrary key/value labels
    cutoff_date: Optional[datetime] = None  # presumably a point-in-time cutoff for the data — confirm with callers
    data_hash: Optional[str] = None  # optional content hash of the backing data
20
+
21
+
22
class FeatureStoreAdapter(ABC):
    """Abstract interface over concrete feature stores (local, Databricks, Feast).

    The six abstract methods are the mandatory table CRUD surface. The four
    concrete methods below are optional feature-view operations: they raise
    NotImplementedError unless a subclass supports them.
    """

    @abstractmethod
    def create_table(self, name: str, schema: Dict[str, str], primary_keys: List[str]) -> AdapterResult:
        """Create a feature table from a {column: dtype} mapping and primary keys."""
        pass

    @abstractmethod
    def write_table(self, name: str, df: pd.DataFrame, mode: str = "merge") -> AdapterResult:
        """Write *df* into the named table; *mode* semantics are adapter-specific."""
        pass

    @abstractmethod
    def read_table(self, name: str, version: Optional[int] = None) -> pd.DataFrame:
        """Read the named table into pandas, optionally at a historical version."""
        pass

    @abstractmethod
    def get_table_metadata(self, name: str) -> Dict[str, Any]:
        """Return adapter-specific metadata for the named table."""
        pass

    @abstractmethod
    def list_tables(self) -> List[str]:
        """Return the names of all tables known to this store."""
        pass

    @abstractmethod
    def delete_table(self, name: str) -> AdapterResult:
        """Remove the named table from the store."""
        pass

    # --- Optional feature-view operations -----------------------------------

    def register_feature_view(self, config: FeatureViewConfig, df: pd.DataFrame) -> str:
        """Register a feature view backed by *df*; returns its identifier."""
        raise NotImplementedError("Subclass must implement register_feature_view")

    def get_historical_features(self, entity_df: pd.DataFrame, feature_refs: List[str]) -> pd.DataFrame:
        """Join historical feature values onto *entity_df*."""
        raise NotImplementedError("Subclass must implement get_historical_features")

    def materialize(self, feature_views: List[str], start_date: str, end_date: str) -> None:
        """Materialize the given feature views for the date range."""
        raise NotImplementedError("Subclass must implement materialize")

    def get_online_features(self, entity_keys: Dict[str, List[Any]], feature_refs: List[str]) -> Dict:
        """Fetch online feature values for the given entity keys."""
        raise NotImplementedError("Subclass must implement get_online_features")
@@ -0,0 +1,94 @@
1
+ from typing import Any, Dict, List, Optional
2
+
3
+ import pandas as pd
4
+
5
+ from customer_retention.core.compat.detection import get_spark_session, is_spark_available
6
+
7
+ from ..base import AdapterResult
8
+ from .base import FeatureStoreAdapter, FeatureViewConfig
9
+
10
+
11
class DatabricksFeatureStore(FeatureStoreAdapter):
    """Feature-store adapter backed by the Databricks Feature Engineering client.

    Tables are addressed as ``<catalog>.<schema>.<name>``. Construction requires
    PySpark; the Databricks client itself is imported lazily on first use.
    """

    def __init__(self, catalog: str = "main", schema: str = "default"):
        if not is_spark_available():
            raise ImportError("PySpark required for DatabricksFeatureStore")
        self.catalog = catalog
        self.schema = schema
        self._fe_client = None  # FeatureEngineeringClient, created lazily in fe_client

    @property
    def fe_client(self) -> Any:
        """Lazily instantiate and cache the FeatureEngineeringClient."""
        if self._fe_client is None:
            from databricks.feature_engineering import FeatureEngineeringClient
            self._fe_client = FeatureEngineeringClient()
        return self._fe_client

    def _full_name(self, name: str) -> str:
        """Qualify a bare table name with the configured catalog and schema."""
        return f"{self.catalog}.{self.schema}.{name}"

    def create_table(self, name: str, schema: Dict[str, str], primary_keys: List[str]) -> AdapterResult:
        """Create an empty feature table from a {column: dtype} mapping."""
        full_name = self._full_name(name)
        spark = get_spark_session()
        df = spark.createDataFrame([], self._schema_to_spark(schema))
        self.fe_client.create_table(name=full_name, primary_keys=primary_keys, df=df)
        return AdapterResult(success=True, metadata={"name": full_name})

    def _schema_to_spark(self, schema: Dict[str, str]) -> Any:
        """Map {column: 'int'|'float'|'string'} to a Spark StructType.

        Unknown dtype strings fall back to StringType; all fields are nullable.
        """
        from pyspark.sql.types import FloatType, IntegerType, StringType, StructField, StructType
        type_map = {"int": IntegerType(), "float": FloatType(), "string": StringType()}
        fields = [StructField(name, type_map.get(dtype, StringType()), True) for name, dtype in schema.items()]
        return StructType(fields)

    def write_table(self, name: str, df: pd.DataFrame, mode: str = "merge") -> AdapterResult:
        """Write a pandas frame to the feature table; *mode* is passed to the client."""
        full_name = self._full_name(name)
        spark = get_spark_session()
        spark_df = spark.createDataFrame(df)
        self.fe_client.write_table(name=full_name, df=spark_df, mode=mode)
        return AdapterResult(success=True)

    def read_table(self, name: str, version: Optional[int] = None) -> pd.DataFrame:
        """Read a feature table into pandas, optionally pinned to a Delta version.

        Fix: the reader is configured once; previously an unversioned DataFrame
        was built and immediately discarded whenever *version* was supplied.
        """
        full_name = self._full_name(name)
        spark = get_spark_session()
        reader = spark.read.format("delta")
        if version is not None:
            reader = reader.option("versionAsOf", version)  # Delta time travel
        return reader.table(full_name).toPandas()

    def get_table_metadata(self, name: str) -> Dict[str, Any]:
        """Return name, primary keys, and feature list for a registered table."""
        full_name = self._full_name(name)
        table_info = self.fe_client.get_table(full_name)
        return {"name": full_name, "primary_keys": table_info.primary_keys, "features": table_info.features}

    def list_tables(self) -> List[str]:
        """List tables whose names fall under this adapter's catalog.schema."""
        tables = self.fe_client.list_tables()
        # NOTE(review): assumes t.name is fully qualified — verify against the client's docs.
        return [t.name for t in tables if t.name.startswith(f"{self.catalog}.{self.schema}")]

    def delete_table(self, name: str) -> AdapterResult:
        """Drop the feature table."""
        full_name = self._full_name(name)
        self.fe_client.drop_table(full_name)
        return AdapterResult(success=True)

    def register_feature_view(self, config: FeatureViewConfig, df: pd.DataFrame) -> str:
        """Create a feature table from *df* keyed by the config's entity key."""
        table_name = self._full_name(config.name)
        spark = get_spark_session()
        spark_df = spark.createDataFrame(df)
        self.fe_client.create_table(name=table_name, primary_keys=[config.entity_key], df=spark_df)
        return table_name

    def get_historical_features(self, entity_df: pd.DataFrame, feature_refs: List[str]) -> pd.DataFrame:
        """Join historical feature values onto *entity_df* via FeatureLookups."""
        from databricks.feature_engineering import FeatureLookup
        spark = get_spark_session()
        # Refs look like "table:feature"; the lookup key is entity_df's first column.
        lookups = [FeatureLookup(table_name=ref.split(":")[0], lookup_key=[entity_df.columns[0]]) for ref in feature_refs]
        training_set = self.fe_client.create_training_set(df=spark.createDataFrame(entity_df), feature_lookups=lookups, label=None)
        return training_set.load_df().toPandas()

    def materialize(self, feature_views: List[str], start_date: str, end_date: str) -> None:
        """Intentional no-op: Databricks feature tables require no materialization step here."""
        pass

    def get_online_features(self, entity_keys: Dict[str, List[Any]], feature_refs: List[str]) -> Dict:
        """Batch-score feature lookups for the given entity keys; returns a plain dict."""
        entity_df = pd.DataFrame(entity_keys)
        spark = get_spark_session()
        from databricks.feature_engineering import FeatureLookup
        lookups = [FeatureLookup(table_name=ref.split(":")[0], lookup_key=list(entity_keys.keys())) for ref in feature_refs]
        result = self.fe_client.score_batch(df=spark.createDataFrame(entity_df), feature_lookups=lookups)
        return result.toPandas().to_dict()
@@ -0,0 +1,97 @@
1
+ from datetime import datetime
2
+ from pathlib import Path
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ import pandas as pd
6
+
7
+ from ..base import AdapterResult
8
+ from .base import FeatureStoreAdapter, FeatureViewConfig
9
+
10
+
11
class FeastAdapter(FeatureStoreAdapter):
    """Feature-store adapter over a Feast repository.

    Feature-view configs and their source frames are cached in memory; backing
    data is persisted under ``<repo_path>/data`` via the local Delta storage
    adapter when importable, falling back to plain parquet files otherwise.
    """

    def __init__(self, repo_path: str = "./feature_store/feature_repo"):
        self._repo_path = repo_path
        self._store = None  # feast.FeatureStore, created lazily in `store`
        self._feature_views: Dict[str, FeatureViewConfig] = {}
        self._data_sources: Dict[str, pd.DataFrame] = {}
        self.storage = _get_storage()  # local Delta adapter or None (parquet fallback)

    @property
    def store(self):
        """Lazily open the Feast FeatureStore for the configured repo path."""
        if self._store is None:
            from feast import FeatureStore
            self._store = FeatureStore(repo_path=self._repo_path)
        return self._store

    def register_feature_view(self, config: FeatureViewConfig, df: pd.DataFrame) -> str:
        """Cache the view config and persist its source frame; returns the view name."""
        self._feature_views[config.name] = config
        self._data_sources[config.name] = df
        data_dir = Path(self._repo_path) / "data"
        data_dir.mkdir(parents=True, exist_ok=True)
        if self.storage:
            self.storage.write(df, str(data_dir / config.name))
        else:
            df.to_parquet(data_dir / f"{config.name}.parquet", index=False)
        return config.name

    def get_historical_features(self, entity_df: pd.DataFrame, feature_refs: List[str]) -> pd.DataFrame:
        """Point-in-time join of *feature_refs* onto *entity_df* via Feast."""
        return self.store.get_historical_features(entity_df=entity_df, features=feature_refs).to_df()

    def materialize(self, feature_views: List[str], start_date: str, end_date: str) -> None:
        """Materialize the given views into the online store for [start_date, end_date]."""
        self.store.materialize(
            start_date=datetime.fromisoformat(start_date),
            end_date=datetime.fromisoformat(end_date),
            feature_views=feature_views
        )

    def get_online_features(self, entity_keys: Dict[str, List[Any]], feature_refs: List[str]) -> Dict:
        """Fetch online features for column-oriented entity keys.

        Fix: an empty *entity_keys* mapping no longer raises StopIteration from
        ``next(iter(...))``; it simply queries with no entity rows.
        """
        if entity_keys:
            row_count = len(next(iter(entity_keys.values())))
            entity_rows = [{k: v[i] for k, v in entity_keys.items()} for i in range(row_count)]
        else:
            entity_rows = []
        return self.store.get_online_features(features=feature_refs, entity_rows=entity_rows).to_dict()

    def create_table(self, name: str, schema: Dict[str, str], primary_keys: List[str]) -> AdapterResult:
        """Register an in-memory feature view; the first primary key becomes the entity key."""
        config = FeatureViewConfig(name=name, entity_key=primary_keys[0], features=list(schema.keys()))
        self._feature_views[name] = config
        return AdapterResult(success=True, metadata={"name": name})

    def write_table(self, name: str, df: pd.DataFrame, mode: str = "merge") -> AdapterResult:
        """Persist *df* as the view's data; *mode* is accepted but the write is a full rewrite."""
        if name not in self._feature_views:
            return AdapterResult(success=False, error=f"Feature view {name} not found")
        config = self._feature_views[name]
        self.register_feature_view(config, df)
        return AdapterResult(success=True)

    def read_table(self, name: str, version: Optional[int] = None) -> pd.DataFrame:
        """Return the cached frame, falling back to on-disk Delta/parquet copies.

        *version* is honored only for the Delta fallback; the in-memory cache and
        parquet fallback always return the latest data.
        """
        if name not in self._data_sources:
            delta_path = Path(self._repo_path) / "data" / name
            parquet_path = Path(self._repo_path) / "data" / f"{name}.parquet"
            if self.storage and delta_path.is_dir() and self.storage.exists(str(delta_path)):
                return self.storage.read(str(delta_path), version=version)
            if parquet_path.exists():
                return pd.read_parquet(parquet_path)
            raise KeyError(f"Feature view {name} not found")
        return self._data_sources[name]

    def get_table_metadata(self, name: str) -> Dict[str, Any]:
        """Describe a registered feature view; raises KeyError if unknown."""
        if name not in self._feature_views:
            raise KeyError(f"Feature view {name} not found")
        config = self._feature_views[name]
        return {"name": config.name, "entity_key": config.entity_key, "features": config.features, "ttl_days": config.ttl_days}

    def list_tables(self) -> List[str]:
        """Names of all registered feature views."""
        return list(self._feature_views.keys())

    def delete_table(self, name: str) -> AdapterResult:
        """Drop a view from the in-memory registry (on-disk files are left in place)."""
        if name not in self._feature_views:
            return AdapterResult(success=False, error=f"Feature view {name} not found")
        del self._feature_views[name]
        if name in self._data_sources:
            del self._data_sources[name]
        return AdapterResult(success=True)
90
+
91
+
92
+ def _get_storage():
93
+ try:
94
+ from customer_retention.integrations.adapters.factory import get_delta
95
+ return get_delta(force_local=True)
96
+ except ImportError:
97
+ return None
@@ -0,0 +1,75 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ import pandas as pd
6
+
7
+ from ..base import AdapterResult
8
+ from ..storage import LocalDelta
9
+ from .base import FeatureStoreAdapter
10
+
11
+
12
class LocalFeatureStore(FeatureStoreAdapter):
    """File-backed feature store: a JSON registry plus local Delta table files."""

    def __init__(self, base_path: str = "./feature_store"):
        self.base_path = Path(base_path)
        self.base_path.mkdir(parents=True, exist_ok=True)
        self.registry_path = self.base_path / "registry.json"
        self.storage = LocalDelta()
        self._load_registry()

    def _load_registry(self) -> None:
        """Load registry.json into self._registry, or start an empty registry."""
        if self.registry_path.exists():
            with open(self.registry_path) as f:
                self._registry = json.load(f)
        else:
            self._registry = {"tables": {}}

    def _save_registry(self) -> None:
        """Persist the in-memory registry back to registry.json."""
        with open(self.registry_path, "w") as f:
            json.dump(self._registry, f, indent=2)

    def _table_path(self, name: str) -> str:
        """Filesystem location for a table's Delta data under base_path/tables."""
        return str(self.base_path / "tables" / name)

    def create_table(self, name: str, schema: Dict[str, str], primary_keys: List[str]) -> AdapterResult:
        """Record a table (schema, keys, path) in the registry; no data is written yet."""
        self._registry["tables"][name] = {
            "schema": schema,
            "primary_keys": primary_keys,
            "path": self._table_path(name)
        }
        self._save_registry()
        return AdapterResult(success=True, metadata={"name": name})

    def write_table(self, name: str, df: pd.DataFrame, mode: str = "merge") -> AdapterResult:
        """Write *df* to a registered table; 'merge' upserts on the primary keys."""
        if name not in self._registry["tables"]:
            return AdapterResult(success=False, error=f"Table {name} not found")
        table_info = self._registry["tables"][name]
        path = table_info["path"]
        if mode == "merge" and Path(path).exists():
            primary_keys = table_info["primary_keys"]
            # Equality on every primary key; the condition syntax is defined by
            # LocalDelta.merge — confirm against that adapter's contract.
            condition = " AND ".join([f"source.{k} = target.{k}" for k in primary_keys])
            self.storage.merge(df, path, condition)
        else:
            # First merge-write falls back to overwrite: nothing exists to merge into.
            self.storage.write(df, path, mode="overwrite" if mode == "merge" else mode)
        return AdapterResult(success=True)

    def read_table(self, name: str, version: Optional[int] = None) -> pd.DataFrame:
        """Read a table (optionally a historical version) into pandas; KeyError if unregistered."""
        if name not in self._registry["tables"]:
            raise KeyError(f"Table {name} not found")
        path = self._registry["tables"][name]["path"]
        return self.storage.read(path, version=version)

    def get_table_metadata(self, name: str) -> Dict[str, Any]:
        """Return the registry entry (schema, primary_keys, path); KeyError if unknown."""
        if name not in self._registry["tables"]:
            raise KeyError(f"Table {name} not found")
        return self._registry["tables"][name]

    def list_tables(self) -> List[str]:
        """Names of all registered tables."""
        return list(self._registry["tables"].keys())

    def delete_table(self, name: str) -> AdapterResult:
        """Remove a table from the registry (its data files on disk are not deleted)."""
        if name not in self._registry["tables"]:
            return AdapterResult(success=False, error=f"Table {name} not found")
        del self._registry["tables"][name]
        self._save_registry()
        return AdapterResult(success=True)