churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from dataclasses import asdict, dataclass, field
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
from customer_retention.core.config.column_config import ColumnType
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class SourceSpec:
    """Describes a single input data source of a generated pipeline."""

    # Logical name used to reference this source (e.g. "primary_source").
    name: str
    # Location of the data; taken from findings.source_path when generated.
    path: str
    # Storage format string; taken from findings.source_format when generated.
    format: str
    # Reader-specific options passed through to the loader.
    options: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return a plain-dict representation (deep copy via dataclasses.asdict)."""
        return asdict(self)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class ColumnSpec:
    """Schema entry for one column: physical dtype plus semantic role."""

    name: str
    data_type: str
    semantic_type: str
    nullable: bool = True
    description: str = ""

    def to_dict(self) -> dict:
        """Return a plain-dict representation (via dataclasses.asdict)."""
        return asdict(self)

    @classmethod
    def from_column_finding(cls, finding) -> "ColumnSpec":
        """Build a ColumnSpec from a column profiling finding.

        The storage dtype is looked up from the finding's inferred ColumnType,
        falling back to "string" for unmapped types. The column is flagged
        nullable when the profiler counted at least one null value.
        """
        dtype_by_column_type = {
            ColumnType.IDENTIFIER: "string",
            ColumnType.TARGET: "integer",
            ColumnType.BINARY: "integer",
            ColumnType.NUMERIC_CONTINUOUS: "float",
            ColumnType.NUMERIC_DISCRETE: "integer",
            ColumnType.CATEGORICAL_NOMINAL: "string",
            ColumnType.CATEGORICAL_ORDINAL: "string",
            ColumnType.CATEGORICAL_CYCLICAL: "string",
            ColumnType.DATETIME: "timestamp",
            ColumnType.TEXT: "string",
        }
        inferred = finding.inferred_type
        observed_nulls = finding.universal_metrics.get("null_count", 0)
        return cls(
            name=finding.name,
            data_type=dtype_by_column_type.get(inferred, "string"),
            semantic_type=inferred.value,
            nullable=observed_nulls > 0,
        )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class SchemaSpec:
    """Table schema: per-column specs plus primary-key and partition metadata."""

    columns: List[ColumnSpec]
    primary_key: Optional[str] = None
    partition_columns: List[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize to a plain dictionary, expanding every column spec."""
        serialized_columns = [column.to_dict() for column in self.columns]
        return {
            "columns": serialized_columns,
            "primary_key": self.primary_key,
            "partition_columns": self.partition_columns,
        }

    @classmethod
    def from_findings(cls, findings) -> "SchemaSpec":
        """Derive a schema from exploration findings.

        One ColumnSpec is created per profiled column; the first identifier
        column (if any were found) becomes the primary key.
        """
        specs = [ColumnSpec.from_column_finding(col) for col in findings.columns.values()]
        identifiers = findings.identifier_columns
        key = identifiers[0] if identifiers else None
        return cls(columns=specs, primary_key=key)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass
class TransformSpec:
    """One transformation step applied at a pipeline stage (bronze/silver/gold)."""

    # Unique step name, e.g. "scale_<col>" or "encode_<col>" for defaults.
    name: str
    # Transform kind; values seen in this module: "standard_scaling", "one_hot_encoding".
    transform_type: str
    # Columns the transform reads.
    input_columns: List[str]
    # Columns the transform produces.
    output_columns: List[str]
    # Transform-specific configuration values.
    parameters: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return a plain-dict representation (via dataclasses.asdict)."""
        return asdict(self)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass
class FeatureSpec:
    """Definition of one derived feature computed from source columns."""

    # Feature name, e.g. "days_since_<col>" for generated defaults.
    name: str
    # Columns the computation reads.
    source_columns: List[str]
    # Identifier of the computation; "days_since_today" is used by defaults.
    computation: str
    # Human-readable description of the feature.
    description: str = ""
    # Computation-specific parameters.
    parameters: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return a plain-dict representation (via dataclasses.asdict)."""
        return asdict(self)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@dataclass
class ModelSpec:
    """Configuration of the model trained by the pipeline."""

    # Model name; defaults elsewhere use "default_model".
    name: str
    # Model family identifier; "gradient_boosting" is used by defaults.
    model_type: str
    # Name of the target/label column.
    target_column: str
    # Names of the input feature columns.
    feature_columns: List[str]
    # Hyperparameter overrides for the model.
    hyperparameters: Dict[str, Any] = field(default_factory=dict)
    # Evaluation metrics to report.
    metrics: List[str] = field(default_factory=lambda: ["auc", "precision", "recall", "f1"])

    def to_dict(self) -> dict:
        """Return a plain-dict representation (via dataclasses.asdict)."""
        return asdict(self)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@dataclass
class QualityGateSpec:
    """A data-quality check evaluated against pipeline data."""

    # Gate name, e.g. "schema_check", "null_check".
    name: str
    # Check kind; values seen in this module: "schema_validation", "null_percentage".
    gate_type: str
    # Column the gate applies to; "*" is used here to mean all columns.
    column: str
    # Numeric limit the check compares against.
    threshold: float
    # What to do on violation; values seen in this module: "fail" (default), "warn".
    action: str = "fail"
    # Gate-specific configuration values.
    parameters: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return a plain-dict representation (via dataclasses.asdict)."""
        return asdict(self)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@dataclass
class PipelineSpec:
    """Top-level declarative specification of a generated ML pipeline.

    Aggregates data sources, the table schema, per-stage transforms
    (bronze/silver/gold), derived-feature definitions, an optional model
    configuration and quality gates. Supports round-tripping through JSON
    or YAML via to_json/to_yaml/save/load.
    """

    name: str = "pipeline"
    version: str = "1.0.0"
    description: str = ""
    # ISO-8601 creation timestamp (naive local time).
    created_at: str = field(default_factory=lambda: datetime.now().isoformat())
    sources: List[SourceSpec] = field(default_factory=list)
    schema: Optional[SchemaSpec] = None
    bronze_transforms: List[TransformSpec] = field(default_factory=list)
    silver_transforms: List[TransformSpec] = field(default_factory=list)
    gold_transforms: List[TransformSpec] = field(default_factory=list)
    feature_definitions: List[FeatureSpec] = field(default_factory=list)
    model_config: Optional[ModelSpec] = None
    quality_gates: List[QualityGateSpec] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_findings(cls, findings, name: Optional[str] = None) -> "PipelineSpec":
        """Build a complete default spec from exploration findings.

        Args:
            findings: Findings object exposing source_path, source_format,
                columns, datetime_columns, identifier_columns and target_column.
            name: Optional pipeline name. Defaults to "<source stem>_pipeline".
                (Annotation fixed: the default is None, so the type is Optional.)
        """
        spec = cls(
            name=name or Path(findings.source_path).stem + "_pipeline",
            description=f"Pipeline generated from {findings.source_path}"
        )
        spec.sources.append(SourceSpec(
            name="primary_source",
            path=findings.source_path,
            format=findings.source_format
        ))
        spec.schema = SchemaSpec.from_findings(findings)
        spec._add_default_transforms(findings)
        spec._add_default_features(findings)
        spec._add_default_model(findings)
        spec._add_default_quality_gates(findings)
        return spec

    def _add_default_transforms(self, findings):
        """Add silver-stage scaling/encoding transforms for each feature column."""
        for name, col in findings.columns.items():
            # Identifier and target columns are never transformed.
            if col.inferred_type == ColumnType.IDENTIFIER:
                continue
            if col.inferred_type == ColumnType.TARGET:
                continue
            if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]:
                self.silver_transforms.append(TransformSpec(
                    name=f"scale_{name}",
                    transform_type="standard_scaling",
                    input_columns=[name],
                    output_columns=[f"{name}_scaled"]
                ))
            elif col.inferred_type in [ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL]:
                self.silver_transforms.append(TransformSpec(
                    name=f"encode_{name}",
                    transform_type="one_hot_encoding",
                    input_columns=[name],
                    output_columns=[f"{name}_encoded"]
                ))

    def _add_default_features(self, findings):
        """Add a "days since" recency feature for every datetime column."""
        for name in findings.datetime_columns:
            self.feature_definitions.append(FeatureSpec(
                name=f"days_since_{name}",
                source_columns=[name],
                computation="days_since_today",
                description=f"Days since {name}"
            ))

    def _add_default_model(self, findings):
        """Configure a gradient-boosting model when a target column was found."""
        if findings.target_column:
            # All non-identifier, non-target columns become model features.
            feature_cols = [
                name for name, col in findings.columns.items()
                if col.inferred_type not in [ColumnType.IDENTIFIER, ColumnType.TARGET]
            ]
            self.model_config = ModelSpec(
                name="default_model",
                model_type="gradient_boosting",
                target_column=findings.target_column,
                feature_columns=feature_cols
            )

    def _add_default_quality_gates(self, findings):
        """Add baseline gates: strict schema validation plus a null-rate warning."""
        self.quality_gates.append(QualityGateSpec(
            name="schema_check",
            gate_type="schema_validation",
            column="*",
            threshold=0
        ))
        self.quality_gates.append(QualityGateSpec(
            name="null_check",
            gate_type="null_percentage",
            column="*",
            threshold=50.0,
            action="warn"
        ))

    def add_transform(self, transform: TransformSpec, stage: str = "silver"):
        """Append a transform to the given stage.

        Args:
            transform: The transform to register.
            stage: One of "bronze", "silver" or "gold".

        Raises:
            ValueError: If stage is not a known stage name. (Previously an
                unknown stage silently dropped the transform.)
        """
        if stage == "bronze":
            self.bronze_transforms.append(transform)
        elif stage == "silver":
            self.silver_transforms.append(transform)
        elif stage == "gold":
            self.gold_transforms.append(transform)
        else:
            raise ValueError(f"Unknown stage: {stage!r}; expected 'bronze', 'silver' or 'gold'")

    def add_feature(self, feature: FeatureSpec):
        """Append a feature definition."""
        self.feature_definitions.append(feature)

    def add_quality_gate(self, gate: QualityGateSpec):
        """Append a quality gate."""
        self.quality_gates.append(gate)

    def to_dict(self) -> dict:
        """Serialize the full spec to a plain dictionary."""
        return {
            "name": self.name,
            "version": self.version,
            "description": self.description,
            "created_at": self.created_at,
            "sources": [s.to_dict() for s in self.sources],
            "schema": self.schema.to_dict() if self.schema else None,
            "bronze_transforms": [t.to_dict() for t in self.bronze_transforms],
            "silver_transforms": [t.to_dict() for t in self.silver_transforms],
            "gold_transforms": [t.to_dict() for t in self.gold_transforms],
            "feature_definitions": [f.to_dict() for f in self.feature_definitions],
            "model_config": self.model_config.to_dict() if self.model_config else None,
            "quality_gates": [g.to_dict() for g in self.quality_gates],
            "metadata": self.metadata
        }

    def to_json(self, indent: int = 2) -> str:
        """Serialize to a JSON string."""
        return json.dumps(self.to_dict(), indent=indent)

    def to_yaml(self) -> str:
        """Serialize to a YAML string, preserving declaration order."""
        return yaml.dump(self.to_dict(), default_flow_style=False, sort_keys=False)

    def save(self, path: str):
        """Write the spec to *path*: YAML if the suffix is .yaml/.yml, else JSON."""
        content = self.to_yaml() if path.endswith((".yaml", ".yml")) else self.to_json()
        with open(path, "w") as f:
            f.write(content)

    @classmethod
    def load(cls, path: str) -> "PipelineSpec":
        """Read a spec from *path*, parsed as YAML or JSON based on the suffix."""
        with open(path, "r") as f:
            content = f.read()
        data = yaml.safe_load(content) if path.endswith((".yaml", ".yml")) else json.loads(content)
        return cls._from_dict(data)

    @classmethod
    def _from_dict(cls, data: dict) -> "PipelineSpec":
        """Reconstruct a PipelineSpec from a deserialized mapping.

        Missing keys fall back to the dataclass defaults, so partial specs load.
        """
        spec = cls(
            name=data.get("name", "pipeline"),
            version=data.get("version", "1.0.0"),
            description=data.get("description", ""),
            created_at=data.get("created_at", datetime.now().isoformat())
        )
        for src_data in data.get("sources", []):
            spec.sources.append(SourceSpec(**src_data))
        if data.get("schema"):
            schema_data = data["schema"]
            columns = [ColumnSpec(**c) for c in schema_data.get("columns", [])]
            spec.schema = SchemaSpec(
                columns=columns,
                primary_key=schema_data.get("primary_key"),
                partition_columns=schema_data.get("partition_columns", [])
            )
        for t_data in data.get("bronze_transforms", []):
            spec.bronze_transforms.append(TransformSpec(**t_data))
        for t_data in data.get("silver_transforms", []):
            spec.silver_transforms.append(TransformSpec(**t_data))
        for t_data in data.get("gold_transforms", []):
            spec.gold_transforms.append(TransformSpec(**t_data))
        for f_data in data.get("feature_definitions", []):
            spec.feature_definitions.append(FeatureSpec(**f_data))
        if data.get("model_config"):
            spec.model_config = ModelSpec(**data["model_config"])
        for g_data in data.get("quality_gates", []):
            spec.quality_gates.append(QualityGateSpec(**g_data))
        spec.metadata = data.get("metadata", {})
        return spec
|
|
File without changes
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from .base import AdapterResult
|
|
2
|
+
from .factory import get_delta, get_feature_store, get_mlflow
|
|
3
|
+
from .feature_store import DatabricksFeatureStore, FeatureStoreAdapter, LocalFeatureStore
|
|
4
|
+
from .mlflow import DatabricksMLflow, LocalMLflow, MLflowAdapter
|
|
5
|
+
from .storage import DatabricksDelta, DeltaStorage, LocalDelta
|
|
6
|
+
|
|
7
|
+
# Public adapter API: result type, storage / feature-store / MLflow adapters,
# and the environment-aware factory helpers.
__all__ = [
    "AdapterResult",
    "DeltaStorage",
    "LocalDelta",
    "DatabricksDelta",
    "FeatureStoreAdapter",
    "LocalFeatureStore",
    "DatabricksFeatureStore",
    "MLflowAdapter",
    "LocalMLflow",
    "DatabricksMLflow",
    "get_delta",
    "get_feature_store",
    "get_mlflow",
]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from customer_retention.core.compat.detection import is_spark_available
|
|
2
|
+
|
|
3
|
+
from .feature_store import DatabricksFeatureStore, FeatureStoreAdapter, LocalFeatureStore
|
|
4
|
+
from .mlflow import DatabricksMLflow, LocalMLflow, MLflowAdapter
|
|
5
|
+
from .storage import DatabricksDelta, DeltaStorage, LocalDelta
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_delta(force_local: bool = False) -> DeltaStorage:
    """Return a Delta storage adapter.

    Databricks-backed when Spark is available; otherwise (or when
    *force_local* is set) a local filesystem implementation.
    """
    use_local = force_local or not is_spark_available()
    return LocalDelta() if use_local else DatabricksDelta()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_feature_store(base_path: str = "./feature_store", catalog: str = "main",
                      schema: str = "default", force_local: bool = False) -> FeatureStoreAdapter:
    """Return a feature-store adapter.

    The local store persists under *base_path*; the Databricks store uses
    Unity Catalog *catalog*/*schema*. Spark availability decides which one,
    unless *force_local* overrides it.
    """
    use_local = force_local or not is_spark_available()
    if use_local:
        return LocalFeatureStore(base_path=base_path)
    return DatabricksFeatureStore(catalog=catalog, schema=schema)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_mlflow(tracking_uri: str = "./mlruns", registry_uri: str = "databricks-uc",
               force_local: bool = False) -> MLflowAdapter:
    """Return an MLflow adapter.

    Local tracking under *tracking_uri* when Spark is absent or
    *force_local* is set; otherwise the Databricks registry at
    *registry_uri*.
    """
    use_local = force_local or not is_spark_available()
    if use_local:
        return LocalMLflow(tracking_uri=tracking_uri)
    return DatabricksMLflow(registry_uri=registry_uri)
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
from .base import FeatureStoreAdapter, FeatureViewConfig
|
|
2
|
+
from .databricks import DatabricksFeatureStore
|
|
3
|
+
from .feast_adapter import FeastAdapter
|
|
4
|
+
from .local import LocalFeatureStore
|
|
5
|
+
|
|
6
|
+
__all__ = ["FeatureStoreAdapter", "FeatureViewConfig", "LocalFeatureStore", "DatabricksFeatureStore", "FeastAdapter"]
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Any, Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from ..base import AdapterResult
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class FeatureViewConfig:
    """Declarative description of a feature view: its entity key, feature columns, and metadata."""

    # Unique name of the feature view (used as the table name by adapters).
    name: str
    # Column used to join feature rows to entities.
    entity_key: str
    # Names of the feature columns exposed by this view.
    features: List[str]
    # Optional time-to-live for served features, in days.
    ttl_days: Optional[int] = None
    # Free-form labels attached to the view.
    tags: Dict[str, str] = field(default_factory=dict)
    # NOTE(review): presumably a point-in-time cutoff for training-data builds — confirm with callers.
    cutoff_date: Optional[datetime] = None
    # NOTE(review): presumably a content hash of the source data for lineage/change detection — confirm.
    data_hash: Optional[str] = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class FeatureStoreAdapter(ABC):
    """Abstract interface for feature-store backends.

    The table CRUD methods are required (abstract). The feature-view
    methods are optional capabilities: they raise NotImplementedError
    unless a backend overrides them.
    """

    @abstractmethod
    def create_table(self, name: str, schema: Dict[str, str], primary_keys: List[str]) -> AdapterResult:
        """Create a feature table with the given column types and primary keys."""
        pass

    @abstractmethod
    def write_table(self, name: str, df: pd.DataFrame, mode: str = "merge") -> AdapterResult:
        """Write *df* into the table; *mode* defaults to an upsert-style merge."""
        pass

    @abstractmethod
    def read_table(self, name: str, version: Optional[int] = None) -> pd.DataFrame:
        """Read the table as a pandas DataFrame, optionally at a historical *version*."""
        pass

    @abstractmethod
    def get_table_metadata(self, name: str) -> Dict[str, Any]:
        """Return backend-specific metadata (schema, keys, ...) for the table."""
        pass

    @abstractmethod
    def list_tables(self) -> List[str]:
        """Return the names of all tables known to this store."""
        pass

    @abstractmethod
    def delete_table(self, name: str) -> AdapterResult:
        """Remove the table from the store."""
        pass

    def register_feature_view(self, config: FeatureViewConfig, df: pd.DataFrame) -> str:
        """Optional: register *df* under *config* and return the view's identifier."""
        raise NotImplementedError("Subclass must implement register_feature_view")

    def get_historical_features(self, entity_df: pd.DataFrame, feature_refs: List[str]) -> pd.DataFrame:
        """Optional: join the referenced features onto *entity_df*."""
        raise NotImplementedError("Subclass must implement get_historical_features")

    def materialize(self, feature_views: List[str], start_date: str, end_date: str) -> None:
        """Optional: push offline features to the online store for a date range."""
        raise NotImplementedError("Subclass must implement materialize")

    def get_online_features(self, entity_keys: Dict[str, List[Any]], feature_refs: List[str]) -> Dict:
        """Optional: fetch current feature values for the given entity keys."""
        raise NotImplementedError("Subclass must implement get_online_features")
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from typing import Any, Dict, List, Optional
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from customer_retention.core.compat.detection import get_spark_session, is_spark_available
|
|
6
|
+
|
|
7
|
+
from ..base import AdapterResult
|
|
8
|
+
from .base import FeatureStoreAdapter, FeatureViewConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DatabricksFeatureStore(FeatureStoreAdapter):
    """Feature-store adapter backed by the Databricks Feature Engineering client.

    Tables live in Unity Catalog under ``catalog.schema``; data crosses the
    API boundary as pandas DataFrames and is converted to Spark internally.
    """

    def __init__(self, catalog: str = "main", schema: str = "default"):
        """Raise ImportError when PySpark is missing so factories can fall back to a local store."""
        if not is_spark_available():
            raise ImportError("PySpark required for DatabricksFeatureStore")
        self.catalog = catalog
        self.schema = schema
        self._fe_client = None  # created lazily on first access

    @property
    def fe_client(self) -> Any:
        """Lazily construct the FeatureEngineeringClient (import deferred to first use)."""
        if self._fe_client is None:
            from databricks.feature_engineering import FeatureEngineeringClient
            self._fe_client = FeatureEngineeringClient()
        return self._fe_client

    def _full_name(self, name: str) -> str:
        """Qualify a bare table name as a three-level Unity Catalog name."""
        return f"{self.catalog}.{self.schema}.{name}"

    def create_table(self, name: str, schema: Dict[str, str], primary_keys: List[str]) -> AdapterResult:
        """Create an empty feature table with the given column types and primary keys."""
        full_name = self._full_name(name)
        spark = get_spark_session()
        df = spark.createDataFrame([], self._schema_to_spark(schema))
        self.fe_client.create_table(name=full_name, primary_keys=primary_keys, df=df)
        return AdapterResult(success=True, metadata={"name": full_name})

    def _schema_to_spark(self, schema: Dict[str, str]) -> Any:
        """Map {column: "int"|"float"|"string"} to a Spark StructType; unknown types become strings."""
        from pyspark.sql.types import FloatType, IntegerType, StringType, StructField, StructType
        type_map = {"int": IntegerType(), "float": FloatType(), "string": StringType()}
        fields = [StructField(name, type_map.get(dtype, StringType()), True) for name, dtype in schema.items()]
        return StructType(fields)

    def write_table(self, name: str, df: pd.DataFrame, mode: str = "merge") -> AdapterResult:
        """Write a pandas frame into the feature table (upsert by default)."""
        full_name = self._full_name(name)
        spark = get_spark_session()
        spark_df = spark.createDataFrame(df)
        self.fe_client.write_table(name=full_name, df=spark_df, mode=mode)
        return AdapterResult(success=True)

    def read_table(self, name: str, version: Optional[int] = None) -> pd.DataFrame:
        """Read the table as pandas, optionally pinned to a Delta version (time travel).

        Fix: configure the reader once instead of loading the table twice
        when a version is requested.
        """
        full_name = self._full_name(name)
        spark = get_spark_session()
        reader = spark.read.format("delta")
        if version is not None:
            reader = reader.option("versionAsOf", version)
        return reader.table(full_name).toPandas()

    def get_table_metadata(self, name: str) -> Dict[str, Any]:
        """Return the name, primary keys, and feature columns reported by the client."""
        full_name = self._full_name(name)
        table_info = self.fe_client.get_table(full_name)
        return {"name": full_name, "primary_keys": table_info.primary_keys, "features": table_info.features}

    def list_tables(self) -> List[str]:
        """List fully-qualified table names within this adapter's catalog.schema.

        Fix: the prefix includes the trailing dot so that e.g. schema
        "default" no longer matches tables in a schema named "default_v2".
        """
        prefix = f"{self.catalog}.{self.schema}."
        tables = self.fe_client.list_tables()
        return [t.name for t in tables if t.name.startswith(prefix)]

    def delete_table(self, name: str) -> AdapterResult:
        """Drop the feature table."""
        self.fe_client.drop_table(self._full_name(name))
        return AdapterResult(success=True)

    def register_feature_view(self, config: FeatureViewConfig, df: pd.DataFrame) -> str:
        """Create a feature table from *df*, keyed by the config's entity key; return the full name."""
        table_name = self._full_name(config.name)
        spark = get_spark_session()
        spark_df = spark.createDataFrame(df)
        self.fe_client.create_table(name=table_name, primary_keys=[config.entity_key], df=spark_df)
        return table_name

    def get_historical_features(self, entity_df: pd.DataFrame, feature_refs: List[str]) -> pd.DataFrame:
        """Join the referenced features onto *entity_df* via FeatureLookups.

        NOTE(review): refs are assumed to look like "table:feature" (only the
        table part is used) and the first entity_df column is taken as the
        lookup key — confirm against callers.
        """
        from databricks.feature_engineering import FeatureLookup
        spark = get_spark_session()
        lookup_key = [entity_df.columns[0]]
        lookups = [FeatureLookup(table_name=ref.split(":")[0], lookup_key=lookup_key) for ref in feature_refs]
        training_set = self.fe_client.create_training_set(df=spark.createDataFrame(entity_df), feature_lookups=lookups, label=None)
        return training_set.load_df().toPandas()

    def materialize(self, feature_views: List[str], start_date: str, end_date: str) -> None:
        """No-op on Databricks: feature tables are Delta tables and need no separate materialization."""
        pass

    def get_online_features(self, entity_keys: Dict[str, List[Any]], feature_refs: List[str]) -> Dict:
        """Batch-score features for the given entity keys; returns a pandas-style dict."""
        entity_df = pd.DataFrame(entity_keys)
        spark = get_spark_session()
        from databricks.feature_engineering import FeatureLookup
        lookups = [FeatureLookup(table_name=ref.split(":")[0], lookup_key=list(entity_keys.keys())) for ref in feature_refs]
        result = self.fe_client.score_batch(df=spark.createDataFrame(entity_df), feature_lookups=lookups)
        return result.toPandas().to_dict()
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, Dict, List, Optional
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from ..base import AdapterResult
|
|
8
|
+
from .base import FeatureStoreAdapter, FeatureViewConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class FeastAdapter(FeatureStoreAdapter):
    """Feature-store adapter bridging to a Feast repository.

    Feature views and their source frames are tracked in memory; data is
    persisted under ``<repo_path>/data`` either through the project's local
    Delta storage (when importable) or as plain parquet files. The Feast
    ``FeatureStore`` itself is only constructed when retrieval methods are
    called.
    """

    def __init__(self, repo_path: str = "./feature_store/feature_repo"):
        self._repo_path = repo_path
        # Lazily-created feast.FeatureStore (see the `store` property).
        self._store = None
        # In-memory registries; lost when the adapter is discarded.
        self._feature_views: Dict[str, FeatureViewConfig] = {}
        self._data_sources: Dict[str, pd.DataFrame] = {}
        # Local Delta storage, or None when the factory cannot be imported.
        self.storage = _get_storage()

    @property
    def store(self):
        """Lazily construct the Feast FeatureStore for this repo path."""
        if self._store is None:
            from feast import FeatureStore
            self._store = FeatureStore(repo_path=self._repo_path)
        return self._store

    def register_feature_view(self, config: FeatureViewConfig, df: pd.DataFrame) -> str:
        """Record *config*/*df* in memory and persist the frame under <repo>/data."""
        self._feature_views[config.name] = config
        self._data_sources[config.name] = df
        data_dir = Path(self._repo_path) / "data"
        data_dir.mkdir(parents=True, exist_ok=True)
        # Prefer Delta storage when available; fall back to a parquet file.
        if self.storage:
            self.storage.write(df, str(data_dir / config.name))
        else:
            df.to_parquet(data_dir / f"{config.name}.parquet", index=False)
        return config.name

    def get_historical_features(self, entity_df: pd.DataFrame, feature_refs: List[str]) -> pd.DataFrame:
        """Delegate to Feast's historical retrieval and return a pandas frame."""
        return self.store.get_historical_features(entity_df=entity_df, features=feature_refs).to_df()

    def materialize(self, feature_views: List[str], start_date: str, end_date: str) -> None:
        """Materialize the given views into Feast's online store for [start_date, end_date].

        Dates are ISO-format strings, parsed with datetime.fromisoformat.
        """
        self.store.materialize(
            start_date=datetime.fromisoformat(start_date),
            end_date=datetime.fromisoformat(end_date),
            feature_views=feature_views
        )

    def get_online_features(self, entity_keys: Dict[str, List[Any]], feature_refs: List[str]) -> Dict:
        """Fetch online features; *entity_keys* maps key column -> list of values.

        All value lists are assumed to have equal length (row count is taken
        from the first one). Raises StopIteration if *entity_keys* is empty.
        """
        # Pivot the column-oriented mapping into Feast's row-oriented shape.
        entity_rows = [{k: v[i] for k, v in entity_keys.items()} for i in range(len(next(iter(entity_keys.values()))))]
        return self.store.get_online_features(features=feature_refs, entity_rows=entity_rows).to_dict()

    def create_table(self, name: str, schema: Dict[str, str], primary_keys: List[str]) -> AdapterResult:
        """Register a feature view from table metadata; only the first primary key is used as entity key."""
        config = FeatureViewConfig(name=name, entity_key=primary_keys[0], features=list(schema.keys()))
        self._feature_views[name] = config
        return AdapterResult(success=True, metadata={"name": name})

    def write_table(self, name: str, df: pd.DataFrame, mode: str = "merge") -> AdapterResult:
        """Re-register the view with *df*; *mode* is accepted for interface parity but ignored."""
        if name not in self._feature_views:
            return AdapterResult(success=False, error=f"Feature view {name} not found")
        config = self._feature_views[name]
        self.register_feature_view(config, df)
        return AdapterResult(success=True)

    def read_table(self, name: str, version: Optional[int] = None) -> pd.DataFrame:
        """Return the view's frame: in-memory first, then Delta dir, then parquet file.

        *version* only has effect on the Delta path; raises KeyError when the
        view is unknown everywhere.
        """
        if name not in self._data_sources:
            delta_path = Path(self._repo_path) / "data" / name
            parquet_path = Path(self._repo_path) / "data" / f"{name}.parquet"
            if self.storage and delta_path.is_dir() and self.storage.exists(str(delta_path)):
                return self.storage.read(str(delta_path), version=version)
            if parquet_path.exists():
                return pd.read_parquet(parquet_path)
            raise KeyError(f"Feature view {name} not found")
        return self._data_sources[name]

    def get_table_metadata(self, name: str) -> Dict[str, Any]:
        """Return the registered view's name, entity key, features, and TTL."""
        if name not in self._feature_views:
            raise KeyError(f"Feature view {name} not found")
        config = self._feature_views[name]
        return {"name": config.name, "entity_key": config.entity_key, "features": config.features, "ttl_days": config.ttl_days}

    def list_tables(self) -> List[str]:
        """Names of all feature views registered in this adapter instance."""
        return list(self._feature_views.keys())

    def delete_table(self, name: str) -> AdapterResult:
        """Drop the view from the in-memory registries (persisted files are left on disk)."""
        if name not in self._feature_views:
            return AdapterResult(success=False, error=f"Feature view {name} not found")
        del self._feature_views[name]
        if name in self._data_sources:
            del self._data_sources[name]
        return AdapterResult(success=True)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _get_storage():
|
|
93
|
+
try:
|
|
94
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
95
|
+
return get_delta(force_local=True)
|
|
96
|
+
except ImportError:
|
|
97
|
+
return None
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, Dict, List, Optional
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from ..base import AdapterResult
|
|
8
|
+
from ..storage import LocalDelta
|
|
9
|
+
from .base import FeatureStoreAdapter
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class LocalFeatureStore(FeatureStoreAdapter):
    """Feature store persisted on the local filesystem.

    A JSON registry (registry.json) tracks each table's schema, primary
    keys, and data path; the table data itself is delegated to a
    LocalDelta storage adapter.
    """

    def __init__(self, base_path: str = "./feature_store"):
        self.base_path = Path(base_path)
        self.base_path.mkdir(parents=True, exist_ok=True)
        self.registry_path = self.base_path / "registry.json"
        self.storage = LocalDelta()
        self._load_registry()

    def _load_registry(self) -> None:
        """Populate self._registry from disk, or start empty when absent."""
        if not self.registry_path.exists():
            self._registry = {"tables": {}}
            return
        with open(self.registry_path) as handle:
            self._registry = json.load(handle)

    def _save_registry(self) -> None:
        """Flush the in-memory registry back to registry.json."""
        with open(self.registry_path, "w") as handle:
            json.dump(self._registry, handle, indent=2)

    def _table_path(self, name: str) -> str:
        """Filesystem location of a table's data directory."""
        return str(self.base_path / "tables" / name)

    def create_table(self, name: str, schema: Dict[str, str], primary_keys: List[str]) -> AdapterResult:
        """Register table metadata; no data is written until write_table."""
        entry = {
            "schema": schema,
            "primary_keys": primary_keys,
            "path": self._table_path(name),
        }
        self._registry["tables"][name] = entry
        self._save_registry()
        return AdapterResult(success=True, metadata={"name": name})

    def write_table(self, name: str, df: pd.DataFrame, mode: str = "merge") -> AdapterResult:
        """Write *df* into the table: upsert on primary keys, or a plain write."""
        tables = self._registry["tables"]
        if name not in tables:
            return AdapterResult(success=False, error=f"Table {name} not found")
        entry = tables[name]
        target = entry["path"]
        if mode == "merge" and Path(target).exists():
            match_clause = " AND ".join(
                f"source.{key} = target.{key}" for key in entry["primary_keys"]
            )
            self.storage.merge(df, target, match_clause)
        else:
            # Merging into a not-yet-existing table degenerates to an overwrite.
            effective_mode = "overwrite" if mode == "merge" else mode
            self.storage.write(df, target, mode=effective_mode)
        return AdapterResult(success=True)

    def read_table(self, name: str, version: Optional[int] = None) -> pd.DataFrame:
        """Read a registered table, optionally at a historical version."""
        tables = self._registry["tables"]
        if name not in tables:
            raise KeyError(f"Table {name} not found")
        return self.storage.read(tables[name]["path"], version=version)

    def get_table_metadata(self, name: str) -> Dict[str, Any]:
        """Return the registry entry (schema, primary keys, path) for *name*."""
        tables = self._registry["tables"]
        if name not in tables:
            raise KeyError(f"Table {name} not found")
        return tables[name]

    def list_tables(self) -> List[str]:
        """All registered table names."""
        return list(self._registry["tables"].keys())

    def delete_table(self, name: str) -> AdapterResult:
        """Drop the table from the registry (data files are left on disk, as before)."""
        tables = self._registry["tables"]
        if name not in tables:
            return AdapterResult(success=False, error=f"Table {name} not found")
        del tables[name]
        self._save_registry()
        return AdapterResult(success=True)
|