churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
import pickle
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Dict, List, Optional, Tuple
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class WarningLevel(Enum):
    """Severity tiers for an early churn warning, from least to most urgent."""
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SignalType(Enum):
    """Kinds of behavioral churn signals a SignalDetector can raise."""
    ACTIVITY_DROP = "activity_drop"      # sharp drop in 7-day activity
    DORMANT_RISK = "dormant_risk"        # too many days since last order
    SUPPORT_SPIKE = "support_spike"      # burst of support tickets in the window
    PAYMENT_ISSUE = "payment_issue"      # one or more payment failures
    EXPLICIT_SIGNAL = "explicit_signal"  # explicit opt-out (email unsubscribe)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class EarlyWarningConfig:
    """Tunable thresholds for signal detection and warning-level bucketing."""
    # Signal-detection thresholds (consumed by SignalDetector).
    activity_drop_threshold: float = 0.50
    dormant_days_threshold: int = 14
    support_spike_count: int = 3
    # NOTE(review): not read anywhere in this module — confirm intended consumer.
    support_spike_window_days: int = 7
    # Score-to-level cutoffs (consumed by EarlyWarningModel.score_to_level).
    # NOTE(review): low_threshold is never consulted — scores below
    # medium_threshold fall through to LOW; confirm this is intentional.
    low_threshold: float = 0.30
    medium_threshold: float = 0.30
    high_threshold: float = 0.50
    critical_threshold: float = 0.90
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class WarningResult:
    """Outcome of scoring one customer for early churn warning."""
    customer_id: str
    warning_score: float
    warning_level: WarningLevel
    warning_signals: List[SignalType]
    primary_signal: Optional[SignalType]
    timestamp: datetime
    recommended_action: Optional[str]

    def to_alert(self):
        """Convert this warning into a monitoring Alert.

        The monitoring module is imported lazily to avoid a hard import
        dependency at module load time.
        """
        from customer_retention.stages.monitoring import Alert, AlertLevel
        # Collapse the four warning tiers onto the three alert levels;
        # anything unrecognized defaults to INFO, as before.
        if self.warning_level is WarningLevel.CRITICAL:
            alert_level = AlertLevel.CRITICAL
        elif self.warning_level in (WarningLevel.MEDIUM, WarningLevel.HIGH):
            alert_level = AlertLevel.WARNING
        else:
            alert_level = AlertLevel.INFO
        return Alert(
            alert_id=f"streaming_warning_{self.customer_id}_{self.timestamp.isoformat()}",
            condition_id="STREAMING_WARNING",
            level=alert_level,
            message=f"Early warning for customer {self.customer_id}: {self.warning_level.value} risk (score: {self.warning_score:.2f})",
            timestamp=self.timestamp
        )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class SignalDetector:
    """Maps a customer's feature snapshot onto discrete churn-warning signals."""

    def __init__(self, activity_drop_threshold: float = 0.50, dormant_days_threshold: int = 14,
                 support_spike_threshold: int = 3):
        self._activity_threshold = activity_drop_threshold
        self._dormant_threshold = dormant_days_threshold
        self._support_threshold = support_spike_threshold

    def detect(self, features: Dict[str, float]) -> List[SignalType]:
        """Return every signal whose rule fires for the given features.

        Missing feature keys are treated as 0 (rule does not fire).
        """
        value = features.get
        rules = [
            (value("activity_drop_7d", 0) >= self._activity_threshold, SignalType.ACTIVITY_DROP),
            (value("days_since_last_order", 0) >= self._dormant_threshold, SignalType.DORMANT_RISK),
            (value("support_tickets_7d", 0) >= self._support_threshold, SignalType.SUPPORT_SPIKE),
            (value("payment_failure", 0) > 0, SignalType.PAYMENT_ISSUE),
            (value("email_unsubscribe", 0) > 0, SignalType.EXPLICIT_SIGNAL),
        ]
        return [signal for fired, signal in rules if fired]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class EarlyWarningModel:
    """Scores customers for near-term churn risk from streaming behavioral features.

    Starts as a hand-weighted linear scorer; ``train`` can fit a logistic
    regression (when scikit-learn is available) whose probabilities then
    drive scoring and whose |coefficients| replace the hand-set weights.
    """

    def __init__(self, config: Optional[EarlyWarningConfig] = None):
        self._config = config or EarlyWarningConfig()
        self._signal_detector = SignalDetector(
            activity_drop_threshold=self._config.activity_drop_threshold,
            dormant_days_threshold=self._config.dormant_days_threshold,
            support_spike_threshold=self._config.support_spike_count
        )
        # Hand-tuned relative importance of each normalized feature;
        # replaced by normalized |logistic coefficients| after train().
        self._weights = {
            "activity_drop_7d": 0.25,
            "days_since_last_order": 0.20,
            "support_tickets_7d": 0.20,
            "email_unsubscribe": 0.15,
            "payment_failure": 0.30,
            "session_abandon_rate": 0.10,
            "negative_review": 0.10
        }
        self._trained = False
        self._model = None

    def predict(self, customer_id: str, features: Dict[str, float]) -> WarningResult:
        """Score one customer and package signals plus a recommended action."""
        score = self._compute_score(features)
        level = self.score_to_level(score)
        signals = self._signal_detector.detect(features)
        primary = self._get_primary_signal(features, signals)
        action = self._get_recommended_action(primary, level)
        return WarningResult(
            customer_id=customer_id,
            warning_score=score,
            warning_level=level,
            warning_signals=signals,
            primary_signal=primary,
            # NOTE(review): naive local time — confirm UTC is not required here.
            timestamp=datetime.now(),
            recommended_action=action
        )

    def predict_batch(self, customers: Dict[str, Dict[str, float]]) -> Dict[str, WarningResult]:
        """Score many customers: {customer_id: features} -> {customer_id: WarningResult}."""
        return {cust_id: self.predict(cust_id, features) for cust_id, features in customers.items()}

    def score_to_level(self, score: float) -> WarningLevel:
        """Bucket a [0, 1] score into a WarningLevel using the config cutoffs.

        ``low_threshold`` is not consulted: anything below ``medium_threshold``
        falls through to LOW.
        """
        if score >= self._config.critical_threshold:
            return WarningLevel.CRITICAL
        elif score >= self._config.high_threshold:
            return WarningLevel.HIGH
        elif score >= self._config.medium_threshold:
            return WarningLevel.MEDIUM
        return WarningLevel.LOW

    def get_feature_importance(self) -> Dict[str, float]:
        """Return a copy of the current feature weights."""
        return self._weights.copy()

    def train(self, training_data: List[Tuple[Dict[str, float], int]]):
        """Fit a logistic regression on (features, label) pairs.

        Best-effort: silently no-ops when numpy/scikit-learn are missing or
        when ``training_data`` is empty. On success, replaces the hand-set
        weights with the normalized absolute coefficients.
        """
        if not training_data:
            # An empty fit would raise inside sklearn and escape the
            # ImportError-only handling; treat it as a no-op instead.
            return
        try:
            import numpy as np
            from sklearn.linear_model import LogisticRegression
        except ImportError:
            return
        feature_names = sorted(self._weights.keys())
        X = [[features.get(f, 0.0) for f in feature_names] for features, _ in training_data]
        y = [label for _, label in training_data]
        self._model = LogisticRegression()
        self._model.fit(np.array(X), np.array(y))
        self._trained = True
        for i, name in enumerate(feature_names):
            self._weights[name] = abs(self._model.coef_[0][i])
        total = sum(self._weights.values())
        if total > 0:
            # Guard: if every coefficient is zero, keep the raw values
            # rather than dividing by zero.
            self._weights = {k: v / total for k, v in self._weights.items()}

    def to_bytes(self) -> bytes:
        """Serialize config, weights, and any fitted model with pickle."""
        return pickle.dumps({
            "config": self._config,
            "weights": self._weights,
            "model": self._model,
            "trained": self._trained
        })

    @classmethod
    def from_bytes(cls, data: bytes) -> "EarlyWarningModel":
        """Rebuild a model from ``to_bytes`` output.

        SECURITY: ``pickle.loads`` can execute arbitrary code — only pass
        payloads produced by a trusted ``to_bytes`` call.
        """
        loaded = pickle.loads(data)
        model = cls(config=loaded["config"])
        model._weights = loaded["weights"]
        model._model = loaded["model"]
        model._trained = loaded["trained"]
        return model

    def _compute_score(self, features: Dict[str, float]) -> float:
        """Return a [0, 1] risk score: model probability if trained, else weighted sum."""
        if self._trained and self._model:
            try:
                import numpy as np
                feature_names = sorted(self._weights.keys())
                X = [[features.get(f, 0.0) for f in feature_names]]
                return float(self._model.predict_proba(np.array(X))[0][1])
            except Exception:
                # Best-effort: fall through to the heuristic score if the
                # fitted model cannot be applied.
                pass
        score = 0.0
        normalized_features = self._normalize_features(features)
        for feature_name, weight in self._weights.items():
            score += weight * normalized_features.get(feature_name, 0.0)
        return min(max(score, 0.0), 1.0)

    def _normalize_features(self, features: Dict[str, float]) -> Dict[str, float]:
        """Map raw feature values onto [0, 1]; counts are capped at fixed scales."""
        days = features.get("days_since_last_order", 0)
        tickets = features.get("support_tickets_7d", 0)
        return {
            "activity_drop_7d": min(features.get("activity_drop_7d", 0), 1.0),
            "days_since_last_order": min(days / 30.0, 1.0),  # 30+ days saturates
            "support_tickets_7d": min(tickets / 5.0, 1.0),   # 5+ tickets saturates
            "email_unsubscribe": min(features.get("email_unsubscribe", 0), 1.0),
            "payment_failure": min(features.get("payment_failure", 0), 1.0),
            "session_abandon_rate": min(features.get("session_abandon_rate", 0), 1.0),
            "negative_review": min(features.get("negative_review", 0), 1.0),
        }

    def _get_primary_signal(self, features: Dict[str, float], signals: List[SignalType]) -> Optional[SignalType]:
        """Pick the highest-priority signal present; payment issues outrank all."""
        if not signals:
            return None
        priority = [
            SignalType.PAYMENT_ISSUE,
            SignalType.EXPLICIT_SIGNAL,
            SignalType.ACTIVITY_DROP,
            SignalType.SUPPORT_SPIKE,
            SignalType.DORMANT_RISK
        ]
        for signal in priority:
            if signal in signals:
                return signal
        return signals[0]

    def _get_recommended_action(self, primary_signal: Optional[SignalType], level: WarningLevel) -> Optional[str]:
        """Map the primary signal to an intervention channel (``level`` currently unused)."""
        if not primary_signal:
            return None
        action_mapping = {
            SignalType.PAYMENT_ISSUE: "phone_call",
            SignalType.EXPLICIT_SIGNAL: "immediate_escalation",
            SignalType.ACTIVITY_DROP: "email_campaign",
            SignalType.SUPPORT_SPIKE: "cs_followup",
            SignalType.DORMANT_RISK: "re_engagement_email"
        }
        return action_mapping.get(primary_signal)
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from datetime import datetime, timedelta
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class EventSource(Enum):
    """Channel a customer event originated from."""
    WEBSITE = "website"
    MOBILE_APP = "mobile_app"
    EMAIL = "email"
    SUPPORT = "support"
    PURCHASE = "purchase"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class EventType(Enum):
    """Concrete customer event kinds; each belongs to exactly one EventSource."""
    PAGE_VIEW = "page_view"
    CLICK = "click"
    SEARCH = "search"
    APP_SESSION = "app_session"
    APP_ACTION = "app_action"
    APP_CRASH = "app_crash"
    EMAIL_OPEN = "email_open"
    EMAIL_CLICK = "email_click"
    EMAIL_UNSUBSCRIBE = "email_unsubscribe"
    SUPPORT_TICKET = "support_ticket"
    SUPPORT_CHAT = "support_chat"
    SUPPORT_CALL = "support_call"
    ORDER = "order"
    RETURN = "return"
    REFUND = "refund"

    @property
    def source_category(self) -> EventSource:
        """Return the EventSource channel this event type belongs to."""
        grouped = {
            EventSource.WEBSITE: (EventType.PAGE_VIEW, EventType.CLICK, EventType.SEARCH),
            EventSource.MOBILE_APP: (EventType.APP_SESSION, EventType.APP_ACTION, EventType.APP_CRASH),
            EventSource.EMAIL: (EventType.EMAIL_OPEN, EventType.EMAIL_CLICK, EventType.EMAIL_UNSUBSCRIBE),
            EventSource.SUPPORT: (EventType.SUPPORT_TICKET, EventType.SUPPORT_CHAT, EventType.SUPPORT_CALL),
            EventSource.PURCHASE: (EventType.ORDER, EventType.RETURN, EventType.REFUND),
        }
        # Invert the grouping into a member -> source lookup; direct indexing
        # preserves the original KeyError behavior for any unmapped member.
        lookup = {member: source for source, members in grouped.items() for member in members}
        return lookup[self]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class Event:
    """A single customer interaction event in the ingestion pipeline.

    Carries identity, typing, timing, and a free-form property payload, and
    supports round-tripping through plain dicts / JSON plus an optional
    PySpark schema for batch jobs.
    """

    event_id: str
    customer_id: str
    # Enum annotations are quoted (forward-reference style) so the dataclass
    # machinery never needs the enum classes at annotation-evaluation time.
    event_type: "EventType"
    event_timestamp: datetime  # when the event occurred at the source
    event_source: "EventSource"
    event_properties: Dict[str, Any]  # free-form payload; schema-checked elsewhere
    session_id: Optional[str] = None
    device_type: Optional[str] = None
    # When the event entered our system; defaults to object-creation time.
    ingestion_timestamp: datetime = field(default_factory=datetime.now)

    @property
    def ingestion_latency_seconds(self) -> float:
        """Seconds elapsed between the event occurring and being ingested."""
        return (self.ingestion_timestamp - self.event_timestamp).total_seconds()

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict: enums become their values, datetimes ISO-8601."""
        return {
            "event_id": self.event_id,
            "customer_id": self.customer_id,
            "event_type": self.event_type.value,
            "event_timestamp": self.event_timestamp.isoformat(),
            "event_source": self.event_source.value,
            "event_properties": self.event_properties,
            "session_id": self.session_id,
            "device_type": self.device_type,
            "ingestion_timestamp": self.ingestion_timestamp.isoformat()
        }

    def to_json(self) -> str:
        """Serialize to pretty-printed JSON (assumes event_properties is JSON-safe)."""
        return json.dumps(self.to_dict(), indent=2)

    @staticmethod
    def _coerce_enum(enum_cls, value):
        """Accept either an enum member or its string value."""
        return enum_cls(value) if isinstance(value, str) else value

    @staticmethod
    def _coerce_datetime(value) -> datetime:
        """Accept either a datetime or an ISO-8601 string."""
        return datetime.fromisoformat(value) if isinstance(value, str) else value

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Event":
        """Build an Event from a dict as produced by to_dict (or raw objects).

        String enum/timestamp values are coerced to their typed forms. A
        missing OR explicitly-None ingestion_timestamp falls back to the
        current time, so ingestion_latency_seconds never sees a None field.
        """
        raw_ingestion = data.get("ingestion_timestamp")
        ingestion_timestamp = (
            datetime.now() if raw_ingestion is None else cls._coerce_datetime(raw_ingestion)
        )
        return cls(
            event_id=data["event_id"],
            customer_id=data["customer_id"],
            event_type=cls._coerce_enum(EventType, data["event_type"]),
            event_timestamp=cls._coerce_datetime(data["event_timestamp"]),
            event_source=cls._coerce_enum(EventSource, data["event_source"]),
            event_properties=data.get("event_properties", {}),
            session_id=data.get("session_id"),
            device_type=data.get("device_type"),
            ingestion_timestamp=ingestion_timestamp
        )

    @staticmethod
    def to_spark_schema():
        """Return the PySpark StructType for event rows, or None if pyspark is unavailable."""
        try:
            from pyspark.sql.types import MapType, StringType, StructField, StructType, TimestampType
            return StructType([
                StructField("event_id", StringType(), False),
                StructField("customer_id", StringType(), False),
                StructField("event_type", StringType(), False),
                StructField("event_timestamp", TimestampType(), False),
                StructField("event_source", StringType(), False),
                StructField("event_properties", MapType(StringType(), StringType()), True),
                StructField("session_id", StringType(), True),
                StructField("device_type", StringType(), True),
                StructField("ingestion_timestamp", TimestampType(), False)
            ])
        except ImportError:
            return None
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@dataclass
class ValidationResult:
    """Outcome of validating a single event."""

    is_valid: bool
    errors: List[str] = field(default_factory=list)  # human-readable failure reasons; empty when valid
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
@dataclass
class BatchValidationResult:
    """Aggregate outcome of validating a batch of events."""

    total_count: int
    valid_count: int
    invalid_count: int
    invalid_events: List[Event] = field(default_factory=list)  # the events that failed validation
    errors: List[str] = field(default_factory=list)  # all error messages across invalid events
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
class EventValidator:
    """Applies basic sanity rules to events, singly or in batches."""

    def __init__(self, max_age_days: int = 30, allow_future: bool = False):
        """Configure the validator.

        max_age_days: events with a timestamp older than this are rejected.
        allow_future: when True, event timestamps after the current time are
        accepted (e.g. to tolerate clock skew at the source).
        """
        self._max_age_days = max_age_days
        self._allow_future = allow_future

    def validate(self, event: Event) -> ValidationResult:
        """Validate one event, collecting every rule violation (not just the first)."""
        errors = []
        # One reference time for both timestamp checks keeps them mutually
        # consistent even when the call straddles a clock tick.
        now = datetime.now()
        if not event.event_id or event.event_id.strip() == "":
            errors.append("event_id is required")
        if not event.customer_id or event.customer_id.strip() == "":
            errors.append("customer_id is required")
        if not self._allow_future and event.event_timestamp > now:
            errors.append("event_timestamp cannot be in the future")
        if event.event_timestamp < now - timedelta(days=self._max_age_days):
            errors.append(f"event_timestamp is older than {self._max_age_days} days")
        # Cross-field consistency: the declared source must match the one
        # implied by the event type.
        if event.event_type.source_category != event.event_source:
            errors.append(f"event_type {event.event_type.value} does not match event_source {event.event_source.value}")
        return ValidationResult(is_valid=len(errors) == 0, errors=errors)

    def validate_batch(self, events: List[Event]) -> BatchValidationResult:
        """Validate each event in turn, collecting invalid events and all messages."""
        valid_count = 0
        invalid_count = 0
        invalid_events = []
        all_errors = []
        for event in events:
            result = self.validate(event)
            if result.is_valid:
                valid_count += 1
            else:
                invalid_count += 1
                invalid_events.append(event)
                all_errors.extend(result.errors)
        return BatchValidationResult(
            total_count=len(events),
            valid_count=valid_count,
            invalid_count=invalid_count,
            invalid_events=invalid_events,
            errors=all_errors
        )
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
@dataclass
class EventSchema:
    """Declares which event_properties keys a named, versioned event carries."""

    name: str
    version: str
    required_properties: List[str]  # keys that must be present in event_properties
    optional_properties: List[str] = field(default_factory=list)  # keys that may be present
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class SchemaRegistry:
    """In-memory registry of event schemas keyed by name and version."""

    def __init__(self):
        # name -> {version -> EventSchema}
        self._schemas: Dict[str, Dict[str, "EventSchema"]] = {}

    def register(self, schema: "EventSchema") -> None:
        """Register a schema under its (name, version); re-registering overwrites."""
        if schema.name not in self._schemas:
            self._schemas[schema.name] = {}
        self._schemas[schema.name][schema.version] = schema

    def get(self, name: str, version: str) -> Optional["EventSchema"]:
        """Return the schema for (name, version), or None if unknown."""
        return self._schemas.get(name, {}).get(version)

    @staticmethod
    def _version_key(version: str) -> List[tuple]:
        """Sort key that compares dot-separated numeric components numerically.

        Previously versions were compared as plain strings, so "1.10.0"
        sorted BELOW "1.2.0". Numeric components now compare as integers;
        non-numeric components fall back to string order and sort after
        numbers.
        """
        key = []
        for part in version.split("."):
            if part.isdigit():
                key.append((0, int(part), ""))
            else:
                key.append((1, 0, part))
        return key

    def get_latest(self, name: str) -> Optional["EventSchema"]:
        """Return the highest-versioned schema for name, or None if unregistered."""
        versions = self._schemas.get(name)
        if not versions:
            return None
        latest = max(versions, key=self._version_key)
        return versions[latest]

    def validate_event(self, event: "Event", schema_name: str, version: str) -> "ValidationResult":
        """Check that an event carries every property its schema requires."""
        schema = self.get(schema_name, version)
        if not schema:
            return ValidationResult(is_valid=False, errors=[f"Schema {schema_name}:{version} not found"])
        errors = []
        for prop in schema.required_properties:
            if prop not in event.event_properties:
                errors.append(f"Required property '{prop}' is missing")
        return ValidationResult(is_valid=len(errors) == 0, errors=errors)
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
import statistics
|
|
2
|
+
import time
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
from customer_retention.core.compat import DataFrame, pd
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class FeatureStoreConfig:
    """Configuration for the online feature store."""

    backend: str = "simulation"  # storage backend identifier
    read_timeout_ms: int = 100   # target read timeout (informational; not enforced by the in-memory store)
    write_timeout_ms: int = 200  # target write timeout (informational; not enforced by the in-memory store)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class TTLConfig:
    """Time-to-live settings for stored feature values."""

    default_ttl_seconds: int = 86400  # 24 hours
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class FeatureRecord:
    """A single (customer, feature) value with freshness metadata."""

    customer_id: str
    feature_name: str
    feature_value: float
    updated_at: datetime = field(default_factory=datetime.now)  # last write time
    ttl_seconds: Optional[int] = None  # expiry age in seconds; None means never expires
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class FeatureWriteResult:
    """Outcome of a write or write_batch call."""

    success: bool
    features_written: int = 0
    latency_ms: float = 0.0
    error: Optional[str] = None  # set only when success is False
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class BatchSyncResult:
    """Outcome of syncing offline-computed features into the online store."""

    success: bool
    customers_synced: int = 0
    features_synced: int = 0
    error: Optional[str] = None  # set only when success is False
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class FeatureStoreMetrics:
    """Aggregate latency and cache statistics for an OnlineFeatureStore."""

    avg_read_latency_ms: float = 0.0
    p99_read_latency_ms: float = 0.0
    avg_write_latency_ms: float = 0.0
    p99_write_latency_ms: float = 0.0
    cache_hit_rate: float = 0.0  # hits / (hits + misses); 0.0 before any reads
    total_reads: int = 0
    total_writes: int = 0
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
class FreshnessMetrics:
    """Freshness summary over currently stored feature values."""

    avg_freshness_seconds: float = 0.0  # mean age of stored records; 0.0 when the store is empty
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass
class FeatureStoreSchema:
    """Column layout advertised by the feature store."""

    columns: List[str] = field(default_factory=lambda: ["customer_id", "feature_name", "feature_value", "updated_at", "ttl"])
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class OnlineFeatureStore:
    """In-memory simulation of an online (low-latency) feature store.

    Keeps the latest record per (customer, feature) plus an append-only write
    history, and tracks read/write latencies and cache hits/misses for
    reporting via get_metrics(). No locking is performed.
    """

    def __init__(self, config: Optional[FeatureStoreConfig] = None):
        self._config = config or FeatureStoreConfig()
        # Latest record per customer_id -> feature_name.
        self._store: Dict[str, Dict[str, FeatureRecord]] = defaultdict(dict)
        # Every record ever written, per customer_id -> feature_name.
        self._history: Dict[str, Dict[str, List[FeatureRecord]]] = defaultdict(lambda: defaultdict(list))
        self._ttl_config = TTLConfig()
        self._read_latencies: List[float] = []   # milliseconds; one entry per read/read_batch call
        self._write_latencies: List[float] = []  # milliseconds; one entry per write/write_batch call
        self._cache_hits = 0
        self._cache_misses = 0

    def write(self, record: FeatureRecord) -> FeatureWriteResult:
        """Store one record as the latest value and append it to history.

        NOTE: mutates the caller's record — updated_at is overwritten with
        the current time.
        """
        start = time.time()
        try:
            record.updated_at = datetime.now()
            self._store[record.customer_id][record.feature_name] = record
            self._history[record.customer_id][record.feature_name].append(record)
            latency = (time.time() - start) * 1000
            self._write_latencies.append(latency)
            return FeatureWriteResult(success=True, features_written=1, latency_ms=latency)
        except Exception as e:
            return FeatureWriteResult(success=False, error=str(e))

    def write_batch(self, customer_id: str, features: Dict[str, float]) -> FeatureWriteResult:
        """Write several feature values for one customer as a single timed operation."""
        start = time.time()
        try:
            for name, value in features.items():
                record = FeatureRecord(
                    customer_id=customer_id,
                    feature_name=name,
                    feature_value=value,
                    updated_at=datetime.now()
                )
                self._store[customer_id][name] = record
                self._history[customer_id][name].append(record)
            latency = (time.time() - start) * 1000
            self._write_latencies.append(latency)
            return FeatureWriteResult(success=True, features_written=len(features), latency_ms=latency)
        except Exception as e:
            return FeatureWriteResult(success=False, error=str(e))

    def read(self, customer_id: str, feature_name: str) -> Optional[FeatureRecord]:
        """Return the latest record, or None when absent or past its TTL.

        Expired records are deleted on access (lazy expiry) and counted as
        cache misses.
        """
        start = time.time()
        record = self._store.get(customer_id, {}).get(feature_name)
        if record and record.ttl_seconds:
            age = (datetime.now() - record.updated_at).total_seconds()
            if age > record.ttl_seconds:
                # Lazy expiry: drop the stale record and treat this as a miss.
                del self._store[customer_id][feature_name]
                record = None
        latency = (time.time() - start) * 1000
        self._read_latencies.append(latency)
        if record:
            self._cache_hits += 1
        else:
            self._cache_misses += 1
        return record

    def read_batch(self, customer_id: str, feature_names: List[str]) -> Dict[str, float]:
        """Read several features for one customer; missing/expired names are omitted.

        NOTE(review): each inner read() records its own latency and the batch
        records one more, so total_reads grows by len(feature_names) + 1 per
        call — confirm that is the intended metric semantics.
        """
        start = time.time()
        result = {}
        for name in feature_names:
            record = self.read(customer_id, name)
            if record:
                result[name] = record.feature_value
        latency = (time.time() - start) * 1000
        self._read_latencies.append(latency)
        return result

    def set_ttl_config(self, config: TTLConfig) -> None:
        """Replace the store-wide TTL configuration."""
        self._ttl_config = config

    def sync_from_batch(self, offline_features: Dict[str, Dict[str, float]], merge_mode: str = "overwrite") -> BatchSyncResult:
        """Load offline-computed features into the online store.

        merge_mode "overwrite" replaces existing values; "preserve_streaming"
        skips (customer, feature) pairs that already have a value. Synced
        records are not appended to history — TODO confirm that is intended.
        """
        try:
            customers_synced = 0
            features_synced = 0
            for customer_id, features in offline_features.items():
                for name, value in features.items():
                    if merge_mode == "preserve_streaming" and customer_id in self._store and name in self._store[customer_id]:
                        continue
                    record = FeatureRecord(
                        customer_id=customer_id,
                        feature_name=name,
                        feature_value=value,
                        updated_at=datetime.now()
                    )
                    self._store[customer_id][name] = record
                    features_synced += 1
                customers_synced += 1
            return BatchSyncResult(success=True, customers_synced=customers_synced, features_synced=features_synced)
        except Exception as e:
            return BatchSyncResult(success=False, error=str(e))

    def cleanup_expired(self) -> int:
        """Eagerly delete every record past its TTL; returns the number removed."""
        expired_count = 0
        for customer_id in list(self._store.keys()):
            for feature_name in list(self._store[customer_id].keys()):
                record = self._store[customer_id][feature_name]
                if record.ttl_seconds:
                    age = (datetime.now() - record.updated_at).total_seconds()
                    if age > record.ttl_seconds:
                        del self._store[customer_id][feature_name]
                        expired_count += 1
        return expired_count

    def get_feature_history(self, customer_id: str, feature_name: str, limit: int = 10) -> List[FeatureRecord]:
        """Return up to `limit` most recent writes, oldest first within the slice.

        NOTE(review): limit=0 returns the FULL history because of the [-0:]
        slice — confirm callers never pass 0.
        """
        history = self._history.get(customer_id, {}).get(feature_name, [])
        return history[-limit:]

    def read_at_time(self, customer_id: str, feature_name: str, timestamp: datetime) -> Optional[FeatureRecord]:
        """Return the most recent record written at or before `timestamp`.

        NOTE(review): if every record postdates `timestamp`, the EARLIEST
        record is returned rather than None — this can leak future values in
        point-in-time lookups; confirm intended.
        """
        history = self._history.get(customer_id, {}).get(feature_name, [])
        for record in reversed(history):
            if record.updated_at <= timestamp:
                return record
        return history[0] if history else None

    def get_metrics(self) -> FeatureStoreMetrics:
        """Summarize latency and cache statistics collected so far."""
        read_lat = self._read_latencies or [0]
        write_lat = self._write_latencies or [0]
        total_cache = self._cache_hits + self._cache_misses
        return FeatureStoreMetrics(
            avg_read_latency_ms=statistics.mean(read_lat),
            # p99 by index into the sorted list; int(n * 0.99) is always < n.
            p99_read_latency_ms=sorted(read_lat)[int(len(read_lat) * 0.99)] if len(read_lat) > 1 else read_lat[0],
            avg_write_latency_ms=statistics.mean(write_lat),
            p99_write_latency_ms=sorted(write_lat)[int(len(write_lat) * 0.99)] if len(write_lat) > 1 else write_lat[0],
            cache_hit_rate=self._cache_hits / total_cache if total_cache > 0 else 0.0,
            total_reads=len(self._read_latencies),
            total_writes=len(self._write_latencies)
        )

    def get_freshness_metrics(self) -> FreshnessMetrics:
        """Average age in seconds of all currently stored (latest) records."""
        all_ages = []
        now = datetime.now()
        for customer_features in self._store.values():
            for record in customer_features.values():
                age = (now - record.updated_at).total_seconds()
                all_ages.append(age)
        return FreshnessMetrics(
            avg_freshness_seconds=statistics.mean(all_ages) if all_ages else 0.0
        )

    def get_schema(self) -> FeatureStoreSchema:
        """Return the store's column schema descriptor."""
        return FeatureStoreSchema()

    def get_feature_table_schema(self) -> List[str]:
        """Column names used by to_delta_dataframe()."""
        return ["customer_id", "feature_name", "feature_value", "updated_at"]

    def to_delta_dataframe(self) -> DataFrame:
        """Export the latest values (not the history) as a DataFrame."""
        rows = []
        for customer_id, features in self._store.items():
            for feature_name, record in features.items():
                rows.append({
                    "customer_id": customer_id,
                    "feature_name": feature_name,
                    "feature_value": record.feature_value,
                    "updated_at": record.updated_at
                })
        return pd.DataFrame(rows)

    def import_from_feature_table(self, feature_table: Dict[str, Dict[str, float]]) -> BatchSyncResult:
        """Convenience alias for sync_from_batch with the default overwrite mode."""
        return self.sync_from_batch(feature_table)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
class FeatureLookup:
|
|
232
|
+
def __init__(self, feature_store: OnlineFeatureStore, feature_names: List[str],
|
|
233
|
+
defaults: Optional[Dict[str, float]] = None):
|
|
234
|
+
self._store = feature_store
|
|
235
|
+
self._feature_names = feature_names
|
|
236
|
+
self._defaults = defaults or {}
|
|
237
|
+
|
|
238
|
+
def get_features(self, customer_id: str) -> Dict[str, float]:
|
|
239
|
+
result = {}
|
|
240
|
+
for name in self._feature_names:
|
|
241
|
+
record = self._store.read(customer_id, name)
|
|
242
|
+
if record:
|
|
243
|
+
result[name] = record.feature_value
|
|
244
|
+
elif name in self._defaults:
|
|
245
|
+
result[name] = self._defaults[name]
|
|
246
|
+
return result
|
|
247
|
+
|
|
248
|
+
def get_features_batch(self, customer_ids: List[str]) -> Dict[str, Dict[str, float]]:
|
|
249
|
+
return {cust_id: self.get_features(cust_id) for cust_id in customer_ids}
|