churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
customer_retention/core/config/pipeline_config.py
@@ -0,0 +1,117 @@
+from enum import Enum
+from typing import Any, Optional
+
+from pydantic import BaseModel
+
+from .source_config import DataSourceConfig
+
+
+class TimestampStrategy(str, Enum):
+    PRODUCTION = "production"
+    SYNTHETIC_RANDOM = "synthetic_random"
+    SYNTHETIC_INDEX = "synthetic_index"
+    SYNTHETIC_FIXED = "synthetic_fixed"
+    DERIVED = "derived"
+
+
+class DedupStrategy(str, Enum):
+    KEEP_FIRST = "keep_first"
+    KEEP_LAST = "keep_last"
+    KEEP_MOST_COMPLETE = "keep_most_complete"
+
+
+class BronzeConfig(BaseModel):
+    deduplicate: bool = True
+    dedup_strategy: DedupStrategy = DedupStrategy.KEEP_LAST
+    dedup_keys: list[str] = ["custid"]
+    max_missing_pct: float = 0.95
+    min_distinct_values: int = 2
+
+
+class SilverConfig(BaseModel):
+    entity_key: str = "custid"
+    reference_date_column: Optional[str] = None
+    auto_detect_encoding: bool = True
+    auto_detect_scaling: bool = True
+
+
+class GoldConfig(BaseModel):
+    feature_store_catalog: str = "main"
+    feature_store_schema: str = "feature_store"
+    feature_table_name: str = "customer_features"
+    version: str = "v1"
+
+    def get_full_feature_table_name(self) -> str:
+        return f"{self.feature_store_catalog}.{self.feature_store_schema}.{self.feature_table_name}"
+
+
+class ModelingConfig(BaseModel):
+    target_column: str = "retained"
+    positive_class: int = 1
+    test_size: float = 0.2
+    stratify: bool = True
+    primary_metric: str = "average_precision"
+    cost_false_negative: float = 100.0
+    cost_false_positive: float = 10.0
+
+    def get_cost_ratio(self) -> float:
+        return self.cost_false_negative / self.cost_false_positive
+
+
+class ValidationConfig(BaseModel):
+    fail_on_critical: bool = True
+    fail_on_high: bool = False
+    leakage_correlation_threshold: float = 0.90
+    max_overfit_gap: float = 0.15
+
+
+class TemporalConfig(BaseModel):
+    timestamp_strategy: TimestampStrategy = TimestampStrategy.PRODUCTION
+    feature_timestamp_column: Optional[str] = None
+    label_timestamp_column: Optional[str] = None
+    observation_window_days: int = 90
+    synthetic_base_date: str = "2024-01-01"
+    synthetic_range_days: int = 365
+    snapshot_prefix: str = "ml_training_snapshot"
+    enforce_point_in_time: bool = True
+    max_feature_target_correlation: float = 0.90
+    block_future_features: bool = True
+    derive_label_from_feature: bool = False
+    derivation_config: Optional[dict[str, Any]] = None
+
+
+class PathConfig(BaseModel):
+    landing_zone: Optional[str] = None
+    bronze: Optional[str] = None
+    silver: Optional[str] = None
+    gold: Optional[str] = None
+
+
+class PipelineConfig(BaseModel):
+    project_name: str
+    project_description: Optional[str] = None
+    version: str = "1.0.0"
+
+    data_sources: list[DataSourceConfig] = []
+    bronze: BronzeConfig = BronzeConfig()
+    silver: SilverConfig = SilverConfig()
+    gold: GoldConfig = GoldConfig()
+    modeling: ModelingConfig = ModelingConfig()
+    validation: ValidationConfig = ValidationConfig()
+    temporal: TemporalConfig = TemporalConfig()
+    paths: PathConfig = PathConfig()
+
+    def get_source_by_name(self, name: str) -> Optional[DataSourceConfig]:
+        return next((s for s in self.data_sources if s.name == name), None)
+
+    def get_target_source(self) -> Optional[DataSourceConfig]:
+        for source in self.data_sources:
+            if any(c.column_type.value == "target" for c in source.columns):
+                return source
+        return None
+
+    def get_all_feature_columns(self) -> list[str]:
+        feature_cols = []
+        for source in self.data_sources:
+            feature_cols.extend([c.name for c in source.get_feature_columns()])
+        return feature_cols
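For orientation, a minimal usage sketch of these config models; the import path is taken from the RECORD above, and the printed values follow from the defaults in this hunk:

    from customer_retention.core.config.pipeline_config import PipelineConfig

    # Only project_name is required; every sub-config falls back to its defaults.
    config = PipelineConfig(project_name="retention_demo")
    print(config.modeling.get_cost_ratio())           # 100.0 / 10.0 == 10.0
    print(config.gold.get_full_feature_table_name())  # "main.feature_store.customer_features"

The mutable defaults (dedup_keys = ["custid"], BronzeConfig(), and friends) are safe here because pydantic copies field defaults per instance, unlike a bare dataclass.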
customer_retention/core/config/source_config.py
@@ -0,0 +1,83 @@
+from enum import Enum
+from typing import Optional
+
+from pydantic import BaseModel, model_validator
+
+from .column_config import ColumnConfig
+
+
+class SourceType(str, Enum):
+    BATCH_FILE = "batch_file"
+    BATCH_TABLE = "batch_table"
+    STREAM = "stream"
+
+
+class FileFormat(str, Enum):
+    CSV = "csv"
+    PARQUET = "parquet"
+    DELTA = "delta"
+    JSON = "json"
+    ORC = "orc"
+    AVRO = "avro"
+
+
+class Grain(str, Enum):
+    CUSTOMER = "customer"
+    TRANSACTION = "transaction"
+    EVENT = "event"
+
+
+class DataSourceConfig(BaseModel):
+    name: str
+    source_type: SourceType
+    primary_key: str
+
+    path: Optional[str] = None
+    file_format: Optional[FileFormat] = None
+
+    catalog: Optional[str] = None
+    schema_name: Optional[str] = None
+    table: Optional[str] = None
+
+    delimiter: str = ","
+    header: bool = True
+    quote_char: str = '"'
+    encoding: str = "utf-8"
+
+    columns: list[ColumnConfig] = []
+    timestamp_column: Optional[str] = None
+    customer_key: Optional[str] = None
+    grain: Optional[Grain] = None
+
+    expected_row_count_min: Optional[int] = None
+    expected_row_count_max: Optional[int] = None
+    expected_columns: Optional[list[str]] = None
+    freshness_sla_hours: Optional[int] = None
+
+    @model_validator(mode='after')
+    def validate_source_requirements(self):
+        if self.source_type == SourceType.BATCH_FILE:
+            if not self.path:
+                raise ValueError("path required for batch_file source_type")
+            if not self.file_format:
+                raise ValueError("file_format required for batch_file source_type")
+        if self.source_type == SourceType.BATCH_TABLE and not self.table:
+            raise ValueError("table required for batch_table source_type")
+        return self
+
+    def get_full_table_name(self) -> str:
+        if self.source_type != SourceType.BATCH_TABLE:
+            raise ValueError("full_table_name only applicable for batch_table")
+        parts = [p for p in [self.catalog, self.schema_name, self.table] if p]
+        return ".".join(parts)
+
+    def get_column_by_name(self, name: str) -> Optional[ColumnConfig]:
+        return next((c for c in self.columns if c.name == name), None)
+
+    def get_feature_columns(self) -> list[ColumnConfig]:
+        return [c for c in self.columns if c.should_be_used_as_feature()]
+
+    def is_cloud_path(self) -> bool:
+        if not self.path:
+            return False
+        return any(self.path.startswith(prefix) for prefix in ["s3://", "abfss://", "gs://", "wasb://", "adl://"])
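A sketch of how the validator behaves (import path assumed from the RECORD; the field is presumably named schema_name rather than schema because "schema" collides with a pydantic BaseModel attribute):

    from customer_retention.core.config.source_config import (
        DataSourceConfig,
        FileFormat,
        SourceType,
    )

    src = DataSourceConfig(
        name="customers",
        source_type=SourceType.BATCH_FILE,
        primary_key="custid",
        path="s3://bucket/raw/customers.csv",
        file_format=FileFormat.CSV,
    )
    print(src.is_cloud_path())  # True: path starts with "s3://"

    # Omitting path or file_format for a BATCH_FILE source raises a pydantic
    # ValidationError wrapping the ValueError from validate_source_requirements.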
customer_retention/core/utils/__init__.py
@@ -0,0 +1,28 @@
+from ..components.enums import Severity
+from .leakage import (
+    DEFAULT_THRESHOLDS,
+    TEMPORAL_METADATA_COLUMNS,
+    LeakageThresholds,
+    calculate_class_overlap,
+    classify_correlation,
+    classify_separation,
+    get_valid_feature_columns,
+)
+from .severity import ThresholdConfig, classify_by_thresholds, severity_recommendation
+from .statistics import (
+    compute_chi_square,
+    compute_effect_size,
+    compute_ks_statistic,
+    compute_psi_categorical,
+    compute_psi_from_series,
+    compute_psi_numeric,
+)
+
+__all__ = [
+    "Severity",
+    "compute_psi_numeric", "compute_psi_categorical", "compute_psi_from_series", "compute_ks_statistic", "compute_chi_square",
+    "compute_effect_size",
+    "LeakageThresholds", "classify_correlation", "classify_separation", "calculate_class_overlap", "DEFAULT_THRESHOLDS",
+    "ThresholdConfig", "classify_by_thresholds", "severity_recommendation",
+    "TEMPORAL_METADATA_COLUMNS", "get_valid_feature_columns",
+]
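The net effect is a flat import surface, e.g.:

    # Consumers can import the helpers from customer_retention.core.utils
    # directly instead of reaching into the individual submodules.
    from customer_retention.core.utils import Severity, compute_psi_from_series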
customer_retention/core/utils/leakage.py
@@ -0,0 +1,85 @@
+from dataclasses import dataclass
+from typing import FrozenSet, List, Optional, Set, Tuple
+
+from customer_retention.core.compat import DataFrame, Series
+
+from ..components.enums import Severity
+
+TEMPORAL_METADATA_COLUMNS: FrozenSet[str] = frozenset({
+    "feature_timestamp",
+    "label_timestamp",
+    "label_available_flag",
+    "event_timestamp",
+})
+
+
+def _build_exclusion_set(entity_column: Optional[str], target_column: Optional[str], additional_exclude: Optional[Set[str]]) -> Set[str]:
+    exclude = set(TEMPORAL_METADATA_COLUMNS)
+    if entity_column:
+        exclude.add(entity_column)
+    if target_column:
+        exclude.add(target_column)
+    if additional_exclude:
+        exclude.update(additional_exclude)
+    return exclude
+
+
+def get_valid_feature_columns(
+    df: DataFrame,
+    entity_column: Optional[str] = None,
+    target_column: Optional[str] = None,
+    additional_exclude: Optional[Set[str]] = None,
+) -> List[str]:
+    """Filter DataFrame columns to those valid as model features."""
+    exclude = _build_exclusion_set(entity_column, target_column, additional_exclude)
+    exclude.update(c for c in df.columns if c.startswith("original_"))
+    return [c for c in df.columns if c not in exclude]
+
+
+@dataclass
+class LeakageThresholds:
+    correlation_critical: float = 0.90
+    correlation_high: float = 0.70
+    correlation_medium: float = 0.50
+    separation_critical: float = 0.0
+    separation_high: float = 1.0
+    separation_medium: float = 5.0
+    auc_critical: float = 0.90
+    auc_high: float = 0.80
+
+
+DEFAULT_THRESHOLDS = LeakageThresholds()
+
+
+def classify_correlation(corr: float, thresholds: LeakageThresholds = DEFAULT_THRESHOLDS) -> Tuple[Severity, str]:
+    abs_corr = abs(corr)
+    if abs_corr >= thresholds.correlation_critical:
+        return Severity.CRITICAL, "high_correlation"
+    if abs_corr >= thresholds.correlation_high:
+        return Severity.HIGH, "suspicious_correlation"
+    if abs_corr >= thresholds.correlation_medium:
+        return Severity.MEDIUM, "elevated_correlation"
+    return Severity.INFO, "normal"
+
+
+def classify_separation(overlap_pct: float, thresholds: LeakageThresholds = DEFAULT_THRESHOLDS) -> Tuple[Severity, str]:
+    if overlap_pct <= thresholds.separation_critical:
+        return Severity.CRITICAL, "perfect_separation"
+    if overlap_pct < thresholds.separation_high:
+        return Severity.HIGH, "near_perfect_separation"
+    if overlap_pct < thresholds.separation_medium:
+        return Severity.MEDIUM, "high_separation"
+    return Severity.INFO, "normal"
+
+
+def calculate_class_overlap(feature: Series, target: Series) -> float:
+    class_0, class_1 = feature[target == 0].dropna(), feature[target == 1].dropna()
+    if len(class_0) == 0 or len(class_1) == 0:
+        return 100.0
+    min_0, max_0 = class_0.min(), class_0.max()
+    min_1, max_1 = class_1.min(), class_1.max()
+    total_range = max(max_0, max_1) - min(min_0, min_1)
+    if total_range == 0:
+        return 100.0
+    overlap = max(0, min(max_0, max_1) - max(min_0, min_1))
+    return (overlap / total_range) * 100
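A small sketch of the overlap heuristic on toy data (assuming the compat Series aliases pandas under the local backend):

    import pandas as pd

    from customer_retention.core.utils.leakage import (
        calculate_class_overlap,
        classify_correlation,
    )

    target = pd.Series([0, 0, 0, 1, 1, 1])
    leaky = pd.Series([1.0, 2.0, 3.0, 10.0, 11.0, 12.0])  # class ranges never overlap

    print(calculate_class_overlap(leaky, target))  # 0.0 -> flagged as perfect separation
    print(classify_correlation(0.95))              # (Severity.CRITICAL, 'high_correlation')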
customer_retention/core/utils/severity.py
@@ -0,0 +1,53 @@
+from dataclasses import dataclass
+from typing import Optional
+
+from ..components.enums import Severity
+
+
+@dataclass
+class ThresholdConfig:
+    critical: Optional[float] = None
+    high: Optional[float] = None
+    warning: Optional[float] = None
+    medium: Optional[float] = None
+    low: Optional[float] = None
+    ascending: bool = True
+
+
+def classify_by_thresholds(value: float, config: ThresholdConfig) -> Severity:
+    if config.ascending:
+        if config.critical is not None and value >= config.critical:
+            return Severity.CRITICAL
+        if config.high is not None and value >= config.high:
+            return Severity.HIGH
+        if config.warning is not None and value >= config.warning:
+            return Severity.WARNING
+        if config.medium is not None and value >= config.medium:
+            return Severity.MEDIUM
+        if config.low is not None and value >= config.low:
+            return Severity.LOW
+    else:
+        if config.critical is not None and value <= config.critical:
+            return Severity.CRITICAL
+        if config.high is not None and value <= config.high:
+            return Severity.HIGH
+        if config.warning is not None and value <= config.warning:
+            return Severity.WARNING
+        if config.medium is not None and value <= config.medium:
+            return Severity.MEDIUM
+        if config.low is not None and value <= config.low:
+            return Severity.LOW
+    return Severity.INFO
+
+
+def severity_recommendation(severity: Severity, context: str, action_critical: str = "investigate immediately",
+                            action_warning: str = "monitor closely", action_info: str = "no action needed") -> str:
+    recommendations = {
+        Severity.CRITICAL: f"CRITICAL: {context}. {action_critical}.",
+        Severity.HIGH: f"HIGH: {context}. {action_critical}.",
+        Severity.WARNING: f"WARNING: {context}. {action_warning}.",
+        Severity.MEDIUM: f"MEDIUM: {context}. {action_warning}.",
+        Severity.LOW: f"LOW: {context}. {action_info}.",
+        Severity.INFO: f"INFO: {context}. {action_info}.",
+    }
+    return recommendations.get(severity, f"INFO: {context}.")
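For example, wiring the conventional PSI cutoffs (0.25 critical, 0.10 warning — illustrative values, not taken from the package) through the ascending branch:

    from customer_retention.core.utils.severity import (
        ThresholdConfig,
        classify_by_thresholds,
    )

    psi = ThresholdConfig(critical=0.25, warning=0.10, ascending=True)
    print(classify_by_thresholds(0.30, psi))  # Severity.CRITICAL
    print(classify_by_thresholds(0.15, psi))  # Severity.WARNING
    print(classify_by_thresholds(0.05, psi))  # Severity.INFO (no threshold matched)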
customer_retention/core/utils/statistics.py
@@ -0,0 +1,90 @@
+from typing import Dict, List, Tuple, Union
+
+import numpy as np
+from scipy import stats
+
+from customer_retention.core.compat import Series
+
+
+def _ensure_array(obj: Union[np.ndarray, List[float]]) -> np.ndarray:
+    return obj if isinstance(obj, np.ndarray) else np.array(obj)
+
+
+def compute_effect_size(group1: Union[np.ndarray, List[float]], group2: Union[np.ndarray, List[float]]) -> Tuple[float, str]:
+    arr1 = _ensure_array(group1)
+    arr2 = _ensure_array(group2)
+    if len(arr1) < 2 or len(arr2) < 2:
+        return 0.0, "Negligible"
+    pooled_std = np.sqrt((np.var(arr1) + np.var(arr2)) / 2)
+    if pooled_std == 0:
+        return 0.0, "Negligible"
+    d = float((np.mean(arr1) - np.mean(arr2)) / pooled_std)
+    abs_d = abs(d)
+    if abs_d >= 0.8:
+        return d, "Large effect"
+    if abs_d >= 0.5:
+        return d, "Medium effect"
+    if abs_d >= 0.2:
+        return d, "Small effect"
+    return d, "Negligible"
+
+
+def compute_psi_numeric(current: Series, reference_hist_edges: List[float], reference_hist_counts: List[int], epsilon: float = 1e-10) -> float:
+    edges = np.array(reference_hist_edges)
+    baseline_counts = np.array(reference_hist_counts)
+    current_counts, _ = np.histogram(current.dropna(), bins=edges)
+    baseline_prop = baseline_counts / baseline_counts.sum()
+    current_prop = current_counts / current_counts.sum() if current_counts.sum() > 0 else np.zeros_like(current_counts, dtype=float)
+    baseline_prop = np.maximum(baseline_prop, epsilon)
+    current_prop = np.maximum(current_prop, epsilon)
+    return float(np.sum((current_prop - baseline_prop) * np.log(current_prop / baseline_prop)))
+
+
+def _is_categorical_dtype(dtype) -> bool:
+    return dtype in ['object', 'category', 'bool']
+
+
+def compute_psi_from_series(reference: Series, current: Series, n_bins: int = 10, epsilon: float = 1e-10) -> float:
+    ref_clean, curr_clean = reference.dropna(), current.dropna()
+    if _is_categorical_dtype(ref_clean.dtype) or _is_categorical_dtype(curr_clean.dtype):
+        return compute_psi_categorical(ref_clean, curr_clean, epsilon)
+    min_val = min(ref_clean.min(), curr_clean.min())
+    max_val = max(ref_clean.max(), curr_clean.max())
+    bins = np.linspace(min_val, max_val, n_bins + 1)
+    ref_hist, _ = np.histogram(ref_clean, bins=bins)
+    curr_hist, _ = np.histogram(curr_clean, bins=bins)
+    ref_pct = ref_hist / len(ref_clean) + epsilon
+    curr_pct = curr_hist / len(curr_clean) + epsilon if len(curr_clean) > 0 else np.full_like(ref_hist, epsilon, dtype=float)
+    return float(np.sum((curr_pct - ref_pct) * np.log(curr_pct / ref_pct)))
+
+
+def compute_psi_categorical(reference: Series, current: Series, epsilon: float = 1e-10) -> float:
+    ref_counts = reference.value_counts(normalize=True)
+    curr_counts = current.value_counts(normalize=True)
+    all_categories = set(ref_counts.index) | set(curr_counts.index)
+    psi = 0.0
+    for cat in all_categories:
+        ref_pct = ref_counts.get(cat, epsilon)
+        curr_pct = curr_counts.get(cat, epsilon)
+        psi += (curr_pct - ref_pct) * np.log((curr_pct + epsilon) / (ref_pct + epsilon))
+    return float(psi)
+
+
+def compute_ks_statistic(reference: Series, current: Series) -> Tuple[float, float]:
+    ref_clean, curr_clean = reference.dropna(), current.dropna()
+    statistic, pvalue = stats.ks_2samp(ref_clean, curr_clean)
+    return float(statistic), float(pvalue)
+
+
+def compute_chi_square(current: Series, baseline_proportions: Dict[str, float]) -> Tuple[float, float]:
+    current_counts = current.value_counts()
+    all_categories = sorted(set(list(current_counts.index) + list(baseline_proportions.keys())))
+    observed, expected = [], []
+    total_current = len(current)
+    for cat in all_categories:
+        observed.append(current_counts.get(cat, 0))
+        expected.append(max(baseline_proportions.get(cat, 0) * total_current, 1e-10))
+    expected_arr = np.array(expected)
+    expected_arr = expected_arr * (sum(observed) / sum(expected_arr)) if sum(expected_arr) > 0 else expected_arr
+    chi_square, pvalue = stats.chisquare(observed, expected_arr)
+    return float(chi_square), float(pvalue)
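A quick self-check of the PSI helper on synthetic data (numpy/pandas only; the exact value depends on the draw, but a 0.5-sigma mean shift lands well above zero):

    import numpy as np
    import pandas as pd

    from customer_retention.core.utils.statistics import compute_psi_from_series

    rng = np.random.default_rng(0)
    reference = pd.Series(rng.normal(0.0, 1.0, 5_000))
    shifted = pd.Series(rng.normal(0.5, 1.0, 5_000))

    print(compute_psi_from_series(reference, reference))  # 0.0: identical distributions
    print(compute_psi_from_series(reference, shifted))    # clearly > 0: shifted mean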
File without changes
customer_retention/generators/notebook_generator/__init__.py
@@ -0,0 +1,167 @@
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Dict, List, Optional
+
+from .base import NotebookGenerator, NotebookStage
+
+if TYPE_CHECKING:
+    from customer_retention.analysis.auto_explorer import ExplorationFindings
+from .cell_builder import CellBuilder
+from .config import FeatureStoreConfig, MLflowConfig, NotebookConfig, OutputFormat, Platform
+from .databricks_generator import DatabricksNotebookGenerator
+from .local_generator import LocalNotebookGenerator
+from .project_init import ProjectInitializer, initialize_project
+from .runner import (
+    NotebookRunner,
+    NotebookValidationResult,
+    ScriptRunner,
+    ValidationReport,
+    validate_generated_notebooks,
+)
+from .script_generator import DatabricksScriptGenerator, LocalScriptGenerator, ScriptGenerator
+
+
+@dataclass
+class GenerationResult:
+    platform: Platform
+    notebook_paths: List[str]
+    validation_report: Optional[ValidationReport] = None
+
+    @property
+    def all_valid(self) -> bool:
+        return self.validation_report.all_passed if self.validation_report else True
+
+
+def generate_orchestration_notebooks(
+    findings_path: Optional[str] = None,
+    output_dir: str = "./generated_pipelines",
+    platforms: Optional[List[Platform]] = None,
+    config: Optional[NotebookConfig] = None,
+    validate: bool = False,
+) -> Dict[Platform, List[str]]:
+    if platforms is None:
+        platforms = [Platform.LOCAL, Platform.DATABRICKS]
+    if config is None:
+        config = NotebookConfig()
+
+    findings = None
+    if findings_path:
+        from customer_retention.analysis.auto_explorer import ExplorationFindings
+        findings = ExplorationFindings.load(findings_path)
+
+    results = {}
+    for platform in platforms:
+        generator = create_notebook_generator(platform, findings, config)
+        platform_dir = str(Path(output_dir) / platform.value)
+        saved_paths = generator.save_all(platform_dir)
+        results[platform] = saved_paths
+
+    return results
+
+
+def generate_and_validate_notebooks(
+    findings_path: Optional[str] = None,
+    output_dir: str = "./generated_pipelines",
+    platforms: Optional[List[Platform]] = None,
+    config: Optional[NotebookConfig] = None,
+) -> Dict[Platform, GenerationResult]:
+    if platforms is None:
+        platforms = [Platform.LOCAL, Platform.DATABRICKS]
+    if config is None:
+        config = NotebookConfig()
+
+    findings = None
+    if findings_path:
+        from customer_retention.analysis.auto_explorer import ExplorationFindings
+        findings = ExplorationFindings.load(findings_path)
+
+    results = {}
+    runner = NotebookRunner(dry_run=True)
+
+    for platform in platforms:
+        generator = create_notebook_generator(platform, findings, config)
+        platform_dir = str(Path(output_dir) / platform.value)
+        saved_paths = generator.save_all(platform_dir)
+        validation_report = runner.validate_sequence(platform_dir, platform.value)
+        results[platform] = GenerationResult(platform, saved_paths, validation_report)
+        save_validation_report(platform_dir, validation_report)
+
+    return results
+
+
+def save_validation_report(output_dir: str, report: ValidationReport) -> str:
+    report_path = Path(output_dir) / "VALIDATION_REPORT.md"
+    report_path.write_text(report.to_markdown())
+    return str(report_path)
+
+
+def create_notebook_generator(
+    platform: Platform,
+    findings: Optional["ExplorationFindings"] = None,
+    config: Optional[NotebookConfig] = None,
+) -> NotebookGenerator:
+    if config is None:
+        config = NotebookConfig()
+
+    if platform == Platform.LOCAL:
+        return LocalNotebookGenerator(config, findings)
+    elif platform == Platform.DATABRICKS:
+        return DatabricksNotebookGenerator(config, findings)
+    else:
+        raise ValueError(f"Unsupported platform: {platform}")
+
+
+def create_script_generator(
+    platform: Platform,
+    findings: Optional["ExplorationFindings"] = None,
+    config: Optional[NotebookConfig] = None,
+) -> ScriptGenerator:
+    if config is None:
+        config = NotebookConfig()
+
+    if platform == Platform.LOCAL:
+        return LocalScriptGenerator(config, findings)
+    elif platform == Platform.DATABRICKS:
+        return DatabricksScriptGenerator(config, findings)
+    else:
+        raise ValueError(f"Unsupported platform: {platform}")
+
+
+def generate_orchestration_scripts(
+    findings_path: Optional[str] = None,
+    output_dir: str = "./generated_pipelines/scripts",
+    platforms: Optional[List[Platform]] = None,
+    config: Optional[NotebookConfig] = None,
+) -> Dict[Platform, List[str]]:
+    if platforms is None:
+        platforms = [Platform.LOCAL, Platform.DATABRICKS]
+    if config is None:
+        config = NotebookConfig()
+
+    findings = None
+    if findings_path:
+        from customer_retention.analysis.auto_explorer import ExplorationFindings
+        findings = ExplorationFindings.load(findings_path)
+
+    results = {}
+    for platform in platforms:
+        generator = create_script_generator(platform, findings, config)
+        platform_dir = str(Path(output_dir) / platform.value)
+        saved_paths = generator.save_all(platform_dir)
+        results[platform] = saved_paths
+
+    return results
+
+
+__all__ = [
+    "NotebookGenerator", "NotebookStage", "NotebookConfig", "Platform",
+    "MLflowConfig", "FeatureStoreConfig", "CellBuilder", "OutputFormat",
+    "LocalNotebookGenerator", "DatabricksNotebookGenerator",
+    "NotebookRunner", "NotebookValidationResult", "ValidationReport", "ScriptRunner",
+    "GenerationResult", "generate_orchestration_notebooks",
+    "generate_and_validate_notebooks", "create_notebook_generator",
+    "validate_generated_notebooks", "save_validation_report",
+    "ScriptGenerator", "LocalScriptGenerator", "DatabricksScriptGenerator",
+    "create_script_generator", "generate_orchestration_scripts",
+    "ProjectInitializer", "initialize_project",
+]
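The typical entry point, sketched under the assumptions that Platform.LOCAL has value "local" (the generators write into a per-platform subdirectory via platform.value) and that "findings.json" is a hypothetical path. Note the validate flag is accepted but not consulted in this hunk; generate_and_validate_notebooks is the validating variant:

    from customer_retention.generators.notebook_generator import (
        Platform,
        generate_orchestration_notebooks,
    )

    paths = generate_orchestration_notebooks(
        findings_path="findings.json",   # hypothetical ExplorationFindings dump
        output_dir="./generated_pipelines",
        platforms=[Platform.LOCAL],
    )
    print(paths[Platform.LOCAL])  # e.g. [".../local/01_ingestion.ipynb", ...]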
customer_retention/generators/notebook_generator/base.py
@@ -0,0 +1,55 @@
+from abc import ABC, abstractmethod
+from enum import Enum
+from pathlib import Path
+from typing import TYPE_CHECKING, Dict, List, Optional
+
+import nbformat
+
+from .config import NotebookConfig
+
+if TYPE_CHECKING:
+    from customer_retention.analysis.auto_explorer import ExplorationFindings
+
+
+class NotebookStage(str, Enum):
+    INGESTION = "01_ingestion"
+    PROFILING = "02_profiling"
+    CLEANING = "03_cleaning"
+    TRANSFORMATION = "04_transformation"
+    FEATURE_ENGINEERING = "05_feature_engineering"
+    FEATURE_SELECTION = "06_feature_selection"
+    MODEL_TRAINING = "07_model_training"
+    DEPLOYMENT = "08_deployment"
+    MONITORING = "09_monitoring"
+    BATCH_INFERENCE = "10_batch_inference"
+    FEATURE_STORE = "11_feature_store"
+
+
+class NotebookGenerator(ABC):
+    def __init__(self, config: NotebookConfig, findings: Optional["ExplorationFindings"]):
+        self.config = config
+        self.findings = findings
+
+    @abstractmethod
+    def generate_stage(self, stage: NotebookStage) -> nbformat.NotebookNode:
+        pass
+
+    @property
+    def available_stages(self) -> List[NotebookStage]:
+        if hasattr(self, "stage_generators"):
+            return list(self.stage_generators.keys())
+        return list(NotebookStage)
+
+    def generate_all(self) -> Dict[NotebookStage, nbformat.NotebookNode]:
+        return {stage: self.generate_stage(stage) for stage in self.available_stages}
+
+    def save_all(self, output_dir: str) -> List[str]:
+        output_path = Path(output_dir)
+        output_path.mkdir(parents=True, exist_ok=True)
+        saved_paths = []
+        for stage, notebook in self.generate_all().items():
+            file_path = output_path / f"{stage.value}.ipynb"
+            with open(file_path, "w", encoding="utf-8") as f:
+                nbformat.write(notebook, f)
+            saved_paths.append(str(file_path))
+        return saved_paths