churnkit-0.75.0a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
|
@@ -0,0 +1,2125 @@
|
|
|
1
|
+
from collections import OrderedDict, namedtuple
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import List, Tuple
|
|
4
|
+
|
|
5
|
+
from jinja2 import BaseLoader, Environment
|
|
6
|
+
|
|
7
|
+
from .models import (
|
|
8
|
+
BronzeEventConfig,
|
|
9
|
+
BronzeLayerConfig,
|
|
10
|
+
LandingLayerConfig,
|
|
11
|
+
PipelineConfig,
|
|
12
|
+
PipelineTransformationType,
|
|
13
|
+
TransformationStep,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
SECTION_MAP = {
|
|
17
|
+
PipelineTransformationType.IMPUTE_NULL: "Missing Value Analysis",
|
|
18
|
+
PipelineTransformationType.DROP_COLUMN: "Missing Value Analysis",
|
|
19
|
+
PipelineTransformationType.CAP_OUTLIER: "Global Outlier Detection",
|
|
20
|
+
PipelineTransformationType.WINSORIZE: "Global Outlier Detection",
|
|
21
|
+
PipelineTransformationType.SEGMENT_AWARE_CAP: "Segment-Aware Outlier Analysis",
|
|
22
|
+
PipelineTransformationType.LOG_TRANSFORM: "Feature Distributions",
|
|
23
|
+
PipelineTransformationType.SQRT_TRANSFORM: "Feature Distributions",
|
|
24
|
+
PipelineTransformationType.YEO_JOHNSON: "Feature Distributions",
|
|
25
|
+
PipelineTransformationType.CAP_THEN_LOG: "Feature Distributions",
|
|
26
|
+
PipelineTransformationType.ZERO_INFLATION_HANDLING: "Feature Distributions",
|
|
27
|
+
PipelineTransformationType.ENCODE: "Categorical Feature Analysis",
|
|
28
|
+
PipelineTransformationType.SCALE: "Feature-Target Correlations",
|
|
29
|
+
PipelineTransformationType.FEATURE_SELECT: "Feature Selection Recommendations",
|
|
30
|
+
PipelineTransformationType.DERIVED_COLUMN: "Feature Engineering Recommendations",
|
|
31
|
+
PipelineTransformationType.TYPE_CAST: "Data Consistency Checks",
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
ANCHOR_MAP = {
|
|
35
|
+
PipelineTransformationType.IMPUTE_NULL: "3.5-Missing-Value-Analysis",
|
|
36
|
+
PipelineTransformationType.DROP_COLUMN: "3.5-Missing-Value-Analysis",
|
|
37
|
+
PipelineTransformationType.CAP_OUTLIER: "3.8-Global-Outlier-Detection",
|
|
38
|
+
PipelineTransformationType.WINSORIZE: "3.8-Global-Outlier-Detection",
|
|
39
|
+
PipelineTransformationType.SEGMENT_AWARE_CAP: "3.7-Segment-Aware-Outlier-Analysis",
|
|
40
|
+
PipelineTransformationType.LOG_TRANSFORM: "4.4-Feature-Distributions-by-Retention-Status",
|
|
41
|
+
PipelineTransformationType.SQRT_TRANSFORM: "4.4-Feature-Distributions-by-Retention-Status",
|
|
42
|
+
PipelineTransformationType.YEO_JOHNSON: "4.4-Feature-Distributions-by-Retention-Status",
|
|
43
|
+
PipelineTransformationType.CAP_THEN_LOG: "4.4-Feature-Distributions-by-Retention-Status",
|
|
44
|
+
PipelineTransformationType.ZERO_INFLATION_HANDLING: "4.4-Feature-Distributions-by-Retention-Status",
|
|
45
|
+
PipelineTransformationType.ENCODE: "4.6-Categorical-Feature-Analysis",
|
|
46
|
+
PipelineTransformationType.SCALE: "4.5-Feature-Target-Correlations",
|
|
47
|
+
PipelineTransformationType.FEATURE_SELECT: "4.9.1-Feature-Selection-Recommendations",
|
|
48
|
+
PipelineTransformationType.DERIVED_COLUMN: "4.9.4-Feature-Engineering-Recommendations",
|
|
49
|
+
PipelineTransformationType.TYPE_CAST: "3.11-Data-Consistency-Checks",
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
DEFAULT_NOTEBOOK_MAP = {
|
|
53
|
+
PipelineTransformationType.IMPUTE_NULL: "03_quality_assessment",
|
|
54
|
+
PipelineTransformationType.DROP_COLUMN: "03_quality_assessment",
|
|
55
|
+
PipelineTransformationType.CAP_OUTLIER: "03_quality_assessment",
|
|
56
|
+
PipelineTransformationType.WINSORIZE: "03_quality_assessment",
|
|
57
|
+
PipelineTransformationType.SEGMENT_AWARE_CAP: "03_quality_assessment",
|
|
58
|
+
PipelineTransformationType.TYPE_CAST: "03_quality_assessment",
|
|
59
|
+
PipelineTransformationType.LOG_TRANSFORM: "04_relationship_analysis",
|
|
60
|
+
PipelineTransformationType.SQRT_TRANSFORM: "04_relationship_analysis",
|
|
61
|
+
PipelineTransformationType.YEO_JOHNSON: "04_relationship_analysis",
|
|
62
|
+
PipelineTransformationType.CAP_THEN_LOG: "04_relationship_analysis",
|
|
63
|
+
PipelineTransformationType.ZERO_INFLATION_HANDLING: "04_relationship_analysis",
|
|
64
|
+
PipelineTransformationType.ENCODE: "04_relationship_analysis",
|
|
65
|
+
PipelineTransformationType.SCALE: "04_relationship_analysis",
|
|
66
|
+
PipelineTransformationType.FEATURE_SELECT: "04_relationship_analysis",
|
|
67
|
+
PipelineTransformationType.DERIVED_COLUMN: "04_relationship_analysis",
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
_docs_base: str = "docs"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _notebook_title(notebook: str) -> str:
|
|
75
|
+
name = notebook.split("_", 1)[1] if "_" in notebook else notebook
|
|
76
|
+
return name.replace("_", " ").title()
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def provenance_docstring(step: TransformationStep) -> str:
|
|
80
|
+
notebook = step.source_notebook or DEFAULT_NOTEBOOK_MAP.get(step.type)
|
|
81
|
+
if not notebook:
|
|
82
|
+
return ""
|
|
83
|
+
title = _notebook_title(notebook)
|
|
84
|
+
anchor = ANCHOR_MAP.get(step.type)
|
|
85
|
+
section = SECTION_MAP.get(step.type)
|
|
86
|
+
base = _docs_base
|
|
87
|
+
if anchor:
|
|
88
|
+
return f"{title} {section}\n {base}/{notebook}.html#{anchor}"
|
|
89
|
+
return f"{title}\n {base}/{notebook}.html"
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def provenance_docstring_block(steps) -> str:
|
|
93
|
+
seen = set()
|
|
94
|
+
entries = []
|
|
95
|
+
for step in steps:
|
|
96
|
+
key = provenance_key(step)
|
|
97
|
+
if not key or key in seen:
|
|
98
|
+
continue
|
|
99
|
+
seen.add(key)
|
|
100
|
+
entry = provenance_docstring(step)
|
|
101
|
+
if entry:
|
|
102
|
+
entries.append(entry)
|
|
103
|
+
if not entries:
|
|
104
|
+
return ""
|
|
105
|
+
body = "\n ".join(entries)
|
|
106
|
+
return f' """\n {body}\n """'
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def provenance_key(step: TransformationStep) -> str:
|
|
110
|
+
notebook = step.source_notebook or DEFAULT_NOTEBOOK_MAP.get(step.type)
|
|
111
|
+
section = SECTION_MAP.get(step.type, "")
|
|
112
|
+
return f"{notebook}:{section}" if notebook else ""
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class StepGrouper:
|
|
116
|
+
|
|
117
|
+
_TYPE_TO_FUNC = {
|
|
118
|
+
PipelineTransformationType.DROP_COLUMN: "drop_unusable_columns",
|
|
119
|
+
PipelineTransformationType.IMPUTE_NULL: "impute_remaining_nulls",
|
|
120
|
+
PipelineTransformationType.CAP_OUTLIER: "cap_outliers",
|
|
121
|
+
PipelineTransformationType.TYPE_CAST: "apply_type_casts",
|
|
122
|
+
PipelineTransformationType.WINSORIZE: "winsorize_outliers",
|
|
123
|
+
PipelineTransformationType.SEGMENT_AWARE_CAP: "cap_segment_aware_outliers",
|
|
124
|
+
PipelineTransformationType.LOG_TRANSFORM: "apply_log_transforms",
|
|
125
|
+
PipelineTransformationType.SQRT_TRANSFORM: "apply_sqrt_transforms",
|
|
126
|
+
PipelineTransformationType.ZERO_INFLATION_HANDLING: "handle_zero_inflation",
|
|
127
|
+
PipelineTransformationType.CAP_THEN_LOG: "apply_cap_then_log_transforms",
|
|
128
|
+
PipelineTransformationType.YEO_JOHNSON: "apply_power_transforms",
|
|
129
|
+
PipelineTransformationType.FEATURE_SELECT: "apply_feature_selection",
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
_DERIVED_ACTION_TO_FUNC = {
|
|
133
|
+
"ratio": "create_ratio_features",
|
|
134
|
+
"interaction": "create_interaction_features",
|
|
135
|
+
"composite": "create_composite_features",
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
@classmethod
|
|
139
|
+
def group(cls, steps: List[TransformationStep]) -> List[Tuple[str, List[TransformationStep]]]:
|
|
140
|
+
if not steps:
|
|
141
|
+
return []
|
|
142
|
+
groups: OrderedDict[str, List[TransformationStep]] = OrderedDict()
|
|
143
|
+
for step in steps:
|
|
144
|
+
groups.setdefault(cls._func_name(step), []).append(step)
|
|
145
|
+
return list(groups.items())
|
|
146
|
+
|
|
147
|
+
@classmethod
|
|
148
|
+
def _func_name(cls, step: TransformationStep) -> str:
|
|
149
|
+
if step.type == PipelineTransformationType.DERIVED_COLUMN:
|
|
150
|
+
action = step.parameters.get("action", "ratio")
|
|
151
|
+
return cls._DERIVED_ACTION_TO_FUNC.get(action, f"create_{action}_features")
|
|
152
|
+
return cls._TYPE_TO_FUNC.get(step.type, f"apply_{step.type.value}")
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
group_steps = StepGrouper.group
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class InlineLoader(BaseLoader):
|
|
159
|
+
def __init__(self, templates: dict):
|
|
160
|
+
self._templates = templates
|
|
161
|
+
|
|
162
|
+
def get_source(self, environment, template):
|
|
163
|
+
if template in self._templates:
|
|
164
|
+
return self._templates[template], template, lambda: True
|
|
165
|
+
raise Exception(f"Template {template} not found")
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
TEMPLATES = {
|
|
169
|
+
"config.py.j2": """import os
|
|
170
|
+
from pathlib import Path
|
|
171
|
+
|
|
172
|
+
PIPELINE_NAME = "{{ config.name }}"
|
|
173
|
+
TARGET_COLUMN = "{{ config.target_column }}"
|
|
174
|
+
OUTPUT_DIR = Path("{{ config.output_dir }}")
|
|
175
|
+
|
|
176
|
+
# Iteration tracking
|
|
177
|
+
ITERATION_ID = {{ '"%s"' % config.iteration_id if config.iteration_id else 'None' }}
|
|
178
|
+
PARENT_ITERATION_ID = {{ '"%s"' % config.parent_iteration_id if config.parent_iteration_id else 'None' }}
|
|
179
|
+
|
|
180
|
+
# Recommendations hash for experiment tracking
|
|
181
|
+
RECOMMENDATIONS_HASH = {{ '"%s"' % config.recommendations_hash if config.recommendations_hash else 'None' }}
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _find_project_root():
|
|
185
|
+
path = Path(__file__).parent
|
|
186
|
+
for _ in range(10):
|
|
187
|
+
if (path / "pyproject.toml").exists() or (path / ".git").exists():
|
|
188
|
+
return path
|
|
189
|
+
path = path.parent
|
|
190
|
+
return Path(__file__).parent
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
PROJECT_ROOT = _find_project_root()
|
|
194
|
+
|
|
195
|
+
# Experiments directory - all artifacts (data, mlruns, feast) go here
|
|
196
|
+
# Override with CR_EXPERIMENTS_DIR environment variable for Databricks/custom locations
|
|
197
|
+
_default_experiments = {{ '"%s"' % config.experiments_dir if config.experiments_dir else '"experiments"' }}
|
|
198
|
+
EXPERIMENTS_DIR = Path(os.environ.get("CR_EXPERIMENTS_DIR", str(PROJECT_ROOT / _default_experiments)))
|
|
199
|
+
|
|
200
|
+
# Documentation base URL for provenance links in generated code
|
|
201
|
+
# Local: file:// URI to HTML docs (from export_tutorial_html.py)
|
|
202
|
+
# Databricks: set to workspace notebook path for exploration report
|
|
203
|
+
DOCS_BASE_URL = os.environ.get("CR_DOCS_BASE_URL", str(EXPERIMENTS_DIR / "docs"))
|
|
204
|
+
|
|
205
|
+
# Production output directory - all pipeline writes go here
|
|
206
|
+
# Override with CR_PRODUCTION_DIR environment variable
|
|
207
|
+
_default_production = {{ '"%s"' % config.production_dir if config.production_dir else 'str(EXPERIMENTS_DIR)' }}
|
|
208
|
+
PRODUCTION_DIR = Path(os.environ.get("CR_PRODUCTION_DIR", _default_production))
|
|
209
|
+
|
|
210
|
+
# MLflow tracking - using SQLite backend (recommended over deprecated file-based backend)
|
|
211
|
+
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", f"sqlite:///{EXPERIMENTS_DIR / 'mlruns.db'}")
|
|
212
|
+
MLFLOW_ARTIFACT_ROOT = str(EXPERIMENTS_DIR / "mlruns" / "artifacts")
|
|
213
|
+
|
|
214
|
+
# Feast feature store configuration - stored in experiments directory
|
|
215
|
+
FEAST_REPO_PATH = str(PRODUCTION_DIR / "feature_repo")
|
|
216
|
+
FEAST_FEATURE_VIEW = "{{ config.feast.feature_view_name if config.feast else config.name + '_features' }}"
|
|
217
|
+
FEAST_ENTITY_NAME = "{{ config.feast.entity_name if config.feast else 'customer' }}"
|
|
218
|
+
FEAST_ENTITY_KEY = "{{ config.feast.entity_key if config.feast else config.sources[0].entity_key }}"
|
|
219
|
+
FEAST_TIMESTAMP_COL = "{{ config.feast.timestamp_column if config.feast else 'event_timestamp' }}"
|
|
220
|
+
FEAST_TTL_DAYS = {{ config.feast.ttl_days if config.feast else 365 }}
|
|
221
|
+
|
|
222
|
+
# Source paths - findings directory is a subfolder of experiments
|
|
223
|
+
FINDINGS_DIR = EXPERIMENTS_DIR / "findings"
|
|
224
|
+
|
|
225
|
+
SOURCES = {
|
|
226
|
+
{% for source in config.sources %}
|
|
227
|
+
"{{ source.name }}": {
|
|
228
|
+
"path": str(FINDINGS_DIR / "{{ source.path }}"),
|
|
229
|
+
"format": "{{ source.format }}",
|
|
230
|
+
"entity_key": "{{ source.entity_key }}",
|
|
231
|
+
{% if source.time_column %}
|
|
232
|
+
"time_column": "{{ source.time_column }}",
|
|
233
|
+
{% endif %}
|
|
234
|
+
"is_event_level": {{ source.is_event_level }},
|
|
235
|
+
},
|
|
236
|
+
{% endfor %}
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def get_bronze_path(source_name: str) -> Path:
|
|
241
|
+
return PRODUCTION_DIR / "data" / "bronze" / f"{source_name}.parquet"
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def get_silver_path() -> Path:
|
|
245
|
+
return PRODUCTION_DIR / "data" / "silver" / "merged.parquet"
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def get_gold_path() -> Path:
|
|
249
|
+
return PRODUCTION_DIR / "data" / "gold" / "features.parquet"
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def get_feast_data_path() -> Path:
|
|
253
|
+
return Path(FEAST_REPO_PATH) / "data" / f"{FEAST_FEATURE_VIEW}.parquet"
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
# Fit mode configuration for training vs scoring separation
|
|
257
|
+
FIT_MODE = {{ 'True' if config.fit_mode else 'False' }}
|
|
258
|
+
ARTIFACTS_PATH = {{ '"%s"' % config.artifacts_path if config.artifacts_path else 'str(PRODUCTION_DIR / "artifacts" / (RECOMMENDATIONS_HASH or "default"))' }}
|
|
259
|
+
|
|
260
|
+
RAW_SOURCES = {
|
|
261
|
+
{% for name, landing in config.landing.items() %}
|
|
262
|
+
"{{ name }}": {
|
|
263
|
+
"path": "{{ landing.raw_source_path }}",
|
|
264
|
+
"format": "{{ landing.raw_source_format }}",
|
|
265
|
+
"entity_key": "{{ landing.entity_column }}",
|
|
266
|
+
"time_column": "{{ landing.time_column }}",
|
|
267
|
+
},
|
|
268
|
+
{% endfor %}
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
EXCLUDED_SOURCES = [
|
|
272
|
+
{% for source in config.sources %}
|
|
273
|
+
{% if source.excluded %}
|
|
274
|
+
"{{ source.name }}",
|
|
275
|
+
{% endif %}
|
|
276
|
+
{% endfor %}
|
|
277
|
+
]
|
|
278
|
+
|
|
279
|
+
EXPLORATION_ARTIFACTS = {
|
|
280
|
+
"bronze": {name: str(EXPERIMENTS_DIR / "data" / "bronze" / f"{name}.parquet") for name in SOURCES},
|
|
281
|
+
"silver": str(EXPERIMENTS_DIR / "data" / "silver" / "merged.parquet"),
|
|
282
|
+
"gold": str(EXPERIMENTS_DIR / "data" / "gold" / "features.parquet"),
|
|
283
|
+
"scoring": str(EXPERIMENTS_DIR / "data" / "scoring" / "predictions.parquet"),
|
|
284
|
+
}
|
|
285
|
+
""",
|
|
286
|
+
"bronze.py.j2": """import pandas as pd
|
|
287
|
+
import numpy as np
|
|
288
|
+
from pathlib import Path
|
|
289
|
+
{% set ops, fitted = collect_imports(config.transformations, False) %}
|
|
290
|
+
{% if ops %}
|
|
291
|
+
from customer_retention.transforms import {{ ops | sort | join(', ') }}
|
|
292
|
+
{% endif %}
|
|
293
|
+
from config import SOURCES, get_bronze_path{{ ', RAW_SOURCES' if config.lifecycle else '' }}
|
|
294
|
+
|
|
295
|
+
SOURCE_NAME = "{{ source }}"
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def load_{{ source }}():
|
|
299
|
+
source_config = SOURCES[SOURCE_NAME]
|
|
300
|
+
path = Path(source_config["path"])
|
|
301
|
+
if path.is_dir() and (path / "_delta_log").is_dir():
|
|
302
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
303
|
+
return get_delta(force_local=True).read(str(path))
|
|
304
|
+
if not path.exists():
|
|
305
|
+
raise FileNotFoundError(f"Source file not found: {path}")
|
|
306
|
+
if source_config["format"] == "csv":
|
|
307
|
+
return pd.read_csv(path)
|
|
308
|
+
return pd.read_parquet(path)
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
{% set groups = group_steps(config.transformations) %}
|
|
312
|
+
|
|
313
|
+
def apply_transformations(df: pd.DataFrame) -> pd.DataFrame:
|
|
314
|
+
{%- if groups %}
|
|
315
|
+
{%- for func_name, steps in groups %}
|
|
316
|
+
df = {{ func_name }}(df)
|
|
317
|
+
{%- endfor %}
|
|
318
|
+
{%- endif %}
|
|
319
|
+
return df
|
|
320
|
+
|
|
321
|
+
{% for func_name, steps in groups %}
|
|
322
|
+
|
|
323
|
+
def {{ func_name }}(df: pd.DataFrame) -> pd.DataFrame:
|
|
324
|
+
{%- set _prov = provenance_docstring_block(steps) %}
|
|
325
|
+
{%- if _prov %}
|
|
326
|
+
{{ _prov }}
|
|
327
|
+
{%- endif %}
|
|
328
|
+
{%- for t in steps %}
|
|
329
|
+
# {{ t.rationale }}
|
|
330
|
+
# {{ action_description(t) }}
|
|
331
|
+
df = {{ render_step_call(t) }}
|
|
332
|
+
{%- endfor %}
|
|
333
|
+
return df
|
|
334
|
+
{% endfor %}
|
|
335
|
+
|
|
336
|
+
{% if config.lifecycle %}
|
|
337
|
+
|
|
338
|
+
# --- Lifecycle enrichment (computed on cleaned data) ---
|
|
339
|
+
|
|
340
|
+
ENTITY_COLUMN = "{{ config.entity_column or config.source.entity_key }}"
|
|
341
|
+
TIME_COLUMN = "{{ config.time_column or config.source.time_column }}"
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _load_raw_events():
|
|
345
|
+
source = RAW_SOURCES[SOURCE_NAME]
|
|
346
|
+
path = Path(source["path"])
|
|
347
|
+
if path.is_dir() and (path / "_delta_log").is_dir():
|
|
348
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
349
|
+
return get_delta(force_local=True).read(str(path))
|
|
350
|
+
if not path.exists():
|
|
351
|
+
raise FileNotFoundError(f"Raw source not found: {path}")
|
|
352
|
+
if source["format"] == "csv":
|
|
353
|
+
return pd.read_csv(path)
|
|
354
|
+
return pd.read_parquet(path)
|
|
355
|
+
|
|
356
|
+
{% if config.lifecycle.include_recency_bucket %}
|
|
357
|
+
|
|
358
|
+
def add_recency_tenure(df: pd.DataFrame, raw_df: pd.DataFrame) -> pd.DataFrame:
|
|
359
|
+
raw_df[TIME_COLUMN] = pd.to_datetime(raw_df[TIME_COLUMN])
|
|
360
|
+
reference_date = raw_df[TIME_COLUMN].max()
|
|
361
|
+
entity_stats = raw_df.groupby(ENTITY_COLUMN)[TIME_COLUMN].agg(["min", "max"])
|
|
362
|
+
entity_stats["days_since_last"] = (reference_date - entity_stats["max"]).dt.days
|
|
363
|
+
entity_stats["days_since_first"] = (reference_date - entity_stats["min"]).dt.days
|
|
364
|
+
df = df.merge(entity_stats[["days_since_last", "days_since_first"]], left_on=ENTITY_COLUMN, right_index=True, how="left")
|
|
365
|
+
return df
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def add_recency_buckets(df: pd.DataFrame) -> pd.DataFrame:
|
|
369
|
+
if "days_since_last" in df.columns:
|
|
370
|
+
df["recency_bucket"] = pd.cut(df["days_since_last"], bins=[0, 7, 30, 90, 180, 365, float("inf")],
|
|
371
|
+
labels=["0-7d", "7-30d", "30-90d", "90-180d", "180-365d", "365d+"])
|
|
372
|
+
return df
|
|
373
|
+
|
|
374
|
+
{% endif %}
|
|
375
|
+
{% if config.lifecycle.include_lifecycle_quadrant %}
|
|
376
|
+
|
|
377
|
+
def add_lifecycle_quadrant(df: pd.DataFrame) -> pd.DataFrame:
|
|
378
|
+
if "days_since_first" not in df.columns:
|
|
379
|
+
return df
|
|
380
|
+
tenure = df["days_since_first"]
|
|
381
|
+
intensity_col = [c for c in df.columns if c.startswith("event_count_")]
|
|
382
|
+
if not intensity_col:
|
|
383
|
+
return df
|
|
384
|
+
intensity = df[intensity_col[0]]
|
|
385
|
+
tenure_med = tenure.median()
|
|
386
|
+
intensity_med = intensity.median()
|
|
387
|
+
conditions = [
|
|
388
|
+
(tenure >= tenure_med) & (intensity >= intensity_med),
|
|
389
|
+
(tenure >= tenure_med) & (intensity < intensity_med),
|
|
390
|
+
(tenure < tenure_med) & (intensity >= intensity_med),
|
|
391
|
+
(tenure < tenure_med) & (intensity < intensity_med),
|
|
392
|
+
]
|
|
393
|
+
labels = ["loyal", "at_risk", "new_active", "new_inactive"]
|
|
394
|
+
df["lifecycle_quadrant"] = np.select(conditions, labels, default="unknown")
|
|
395
|
+
return df
|
|
396
|
+
|
|
397
|
+
{% endif %}
|
|
398
|
+
{% if config.lifecycle.include_cyclical_features %}
|
|
399
|
+
|
|
400
|
+
def add_cyclical_features(df: pd.DataFrame, raw_df: pd.DataFrame) -> pd.DataFrame:
|
|
401
|
+
raw_df[TIME_COLUMN] = pd.to_datetime(raw_df[TIME_COLUMN])
|
|
402
|
+
mean_dow = raw_df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.dayofweek.mean())
|
|
403
|
+
df = df.merge(mean_dow.rename("mean_dow"), left_on=ENTITY_COLUMN, right_index=True, how="left")
|
|
404
|
+
df["dow_sin"] = np.sin(2 * np.pi * df["mean_dow"] / 7)
|
|
405
|
+
df["dow_cos"] = np.cos(2 * np.pi * df["mean_dow"] / 7)
|
|
406
|
+
df = df.drop(columns=["mean_dow"], errors="ignore")
|
|
407
|
+
return df
|
|
408
|
+
|
|
409
|
+
{% endif %}
|
|
410
|
+
{% if config.lifecycle.momentum_pairs %}
|
|
411
|
+
|
|
412
|
+
def add_momentum_ratios(df: pd.DataFrame) -> pd.DataFrame:
|
|
413
|
+
{% for pair in config.lifecycle.momentum_pairs %}
|
|
414
|
+
short_col = "event_count_{{ pair.short_window }}"
|
|
415
|
+
long_col = "event_count_{{ pair.long_window }}"
|
|
416
|
+
if short_col in df.columns and long_col in df.columns:
|
|
417
|
+
df["momentum_{{ pair.short_window }}_{{ pair.long_window }}"] = df[short_col] / df[long_col].replace(0, float("nan"))
|
|
418
|
+
{% endfor %}
|
|
419
|
+
return df
|
|
420
|
+
|
|
421
|
+
{% endif %}
|
|
422
|
+
|
|
423
|
+
def enrich_lifecycle(df: pd.DataFrame) -> pd.DataFrame:
|
|
424
|
+
raw_df = _load_raw_events()
|
|
425
|
+
{% if config.raw_time_column %}
|
|
426
|
+
raw_df = raw_df.rename(columns={"{{ config.raw_time_column }}": TIME_COLUMN})
|
|
427
|
+
{% endif %}
|
|
428
|
+
{% if config.lifecycle.include_recency_bucket %}
|
|
429
|
+
df = add_recency_tenure(df, raw_df)
|
|
430
|
+
df = add_recency_buckets(df)
|
|
431
|
+
{% endif %}
|
|
432
|
+
{% if config.lifecycle.include_lifecycle_quadrant %}
|
|
433
|
+
df = add_lifecycle_quadrant(df)
|
|
434
|
+
{% endif %}
|
|
435
|
+
{% if config.lifecycle.include_cyclical_features %}
|
|
436
|
+
df = add_cyclical_features(df, raw_df)
|
|
437
|
+
{% endif %}
|
|
438
|
+
{% if config.lifecycle.momentum_pairs %}
|
|
439
|
+
df = add_momentum_ratios(df)
|
|
440
|
+
{% endif %}
|
|
441
|
+
return df
|
|
442
|
+
{% endif %}
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def run_bronze_{{ source }}():
|
|
446
|
+
df = load_{{ source }}()
|
|
447
|
+
df = apply_transformations(df)
|
|
448
|
+
{% if config.lifecycle %}
|
|
449
|
+
df = enrich_lifecycle(df)
|
|
450
|
+
{% endif %}
|
|
451
|
+
output_path = get_bronze_path(SOURCE_NAME)
|
|
452
|
+
output_dir = output_path.parent
|
|
453
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
454
|
+
try:
|
|
455
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
456
|
+
storage = get_delta(force_local=True)
|
|
457
|
+
storage.write(df, str(output_dir / SOURCE_NAME))
|
|
458
|
+
except ImportError:
|
|
459
|
+
df.to_parquet(output_path, index=False)
|
|
460
|
+
return df
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
if __name__ == "__main__":
|
|
464
|
+
run_bronze_{{ source }}()
|
|
465
|
+
""",
|
|
466
|
+
"silver.py.j2": '''import pandas as pd
|
|
467
|
+
{% set ops, fitted = collect_imports(config.silver.derived_columns, False) %}
|
|
468
|
+
{% if ops %}
|
|
469
|
+
from customer_retention.transforms import {{ ops | sort | join(', ') }}
|
|
470
|
+
{% endif %}
|
|
471
|
+
from config import SOURCES, get_bronze_path, get_silver_path, TARGET_COLUMN
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
def _load_artifact(path):
|
|
475
|
+
from pathlib import Path as _P
|
|
476
|
+
p = _P(path)
|
|
477
|
+
if p.parent.is_dir() and (p.parent / p.stem / "_delta_log").is_dir():
|
|
478
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
479
|
+
return get_delta(force_local=True).read(str(p.parent / p.stem))
|
|
480
|
+
return pd.read_parquet(path)
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def load_bronze_outputs() -> dict:
|
|
484
|
+
return {name: _load_artifact(get_bronze_path(name))
|
|
485
|
+
for name in SOURCES.keys() if not SOURCES[name].get("excluded")}
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
def merge_sources(bronze_outputs: dict) -> pd.DataFrame:
|
|
489
|
+
base_source = "{{ config.sources[0].name }}"
|
|
490
|
+
merged = bronze_outputs[base_source]
|
|
491
|
+
{% for join in config.silver.joins %}
|
|
492
|
+
merged = merged.merge(
|
|
493
|
+
bronze_outputs["{{ join.right_source }}"],
|
|
494
|
+
left_on="{{ join.left_key }}",
|
|
495
|
+
right_on="{{ join.right_key }}",
|
|
496
|
+
how="{{ join.how }}"
|
|
497
|
+
)
|
|
498
|
+
{% endfor %}
|
|
499
|
+
return merged
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
def create_holdout_mask(df: pd.DataFrame, holdout_fraction: float = 0.1, random_state: int = 42) -> pd.DataFrame:
|
|
503
|
+
"""Create holdout set by masking target for a fraction of records.
|
|
504
|
+
|
|
505
|
+
IMPORTANT: This must happen in the silver layer (BEFORE gold layer feature computation)
|
|
506
|
+
to prevent temporal leakage. If holdout is created after features are computed,
|
|
507
|
+
the features may contain information derived from the target values that will be masked.
|
|
508
|
+
|
|
509
|
+
Args:
|
|
510
|
+
df: DataFrame with TARGET_COLUMN
|
|
511
|
+
holdout_fraction: Fraction of records to use for holdout (default 10%)
|
|
512
|
+
random_state: Random seed for reproducibility
|
|
513
|
+
|
|
514
|
+
Returns:
|
|
515
|
+
DataFrame with holdout mask applied (original values stored in original_{TARGET_COLUMN})
|
|
516
|
+
"""
|
|
517
|
+
ORIGINAL_COLUMN = f"original_{TARGET_COLUMN}"
|
|
518
|
+
|
|
519
|
+
# Skip if holdout already exists
|
|
520
|
+
if ORIGINAL_COLUMN in df.columns:
|
|
521
|
+
print(f" Holdout already exists ({ORIGINAL_COLUMN}), skipping creation")
|
|
522
|
+
return df
|
|
523
|
+
|
|
524
|
+
if TARGET_COLUMN not in df.columns:
|
|
525
|
+
print(f" Warning: TARGET_COLUMN \\'{TARGET_COLUMN}\\' not found, skipping holdout creation")
|
|
526
|
+
return df
|
|
527
|
+
|
|
528
|
+
print(f"Creating holdout set ({holdout_fraction:.0%} of data)...")
|
|
529
|
+
df = df.copy()
|
|
530
|
+
|
|
531
|
+
n_holdout = int(len(df) * holdout_fraction)
|
|
532
|
+
holdout_idx = df.sample(n=n_holdout, random_state=random_state).index
|
|
533
|
+
|
|
534
|
+
# Store original values for holdout records only
|
|
535
|
+
df[ORIGINAL_COLUMN] = pd.NA
|
|
536
|
+
df.loc[holdout_idx, ORIGINAL_COLUMN] = df.loc[holdout_idx, TARGET_COLUMN]
|
|
537
|
+
|
|
538
|
+
# Mask target values for holdout records
|
|
539
|
+
df.loc[holdout_idx, TARGET_COLUMN] = pd.NA
|
|
540
|
+
|
|
541
|
+
print(f" Holdout records: {n_holdout:,} ({holdout_fraction:.0%})")
|
|
542
|
+
print(f" Training records: {len(df) - n_holdout:,} ({1-holdout_fraction:.0%})")
|
|
543
|
+
|
|
544
|
+
return df
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
{% set derived_groups = group_steps(config.silver.derived_columns) %}
|
|
548
|
+
|
|
549
|
+
def create_derived_columns(df: pd.DataFrame) -> pd.DataFrame:
|
|
550
|
+
{%- if derived_groups %}
|
|
551
|
+
{%- for func_name, steps in derived_groups %}
|
|
552
|
+
df = {{ func_name }}(df)
|
|
553
|
+
{%- endfor %}
|
|
554
|
+
{%- endif %}
|
|
555
|
+
return df
|
|
556
|
+
|
|
557
|
+
{% for func_name, steps in derived_groups %}
|
|
558
|
+
|
|
559
|
+
def {{ func_name }}(df: pd.DataFrame) -> pd.DataFrame:
|
|
560
|
+
{%- set _prov = provenance_docstring_block(steps) %}
|
|
561
|
+
{%- if _prov %}
|
|
562
|
+
{{ _prov }}
|
|
563
|
+
{%- endif %}
|
|
564
|
+
{%- for dc in steps %}
|
|
565
|
+
# {{ dc.rationale }}
|
|
566
|
+
# {{ action_description(dc) }}
|
|
567
|
+
df = {{ render_step_call(dc) }}
|
|
568
|
+
{%- endfor %}
|
|
569
|
+
return df
|
|
570
|
+
{% endfor %}
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
def run_silver_merge(create_holdout: bool = True, holdout_fraction: float = 0.1):
|
|
574
|
+
bronze_outputs = load_bronze_outputs()
|
|
575
|
+
silver = merge_sources(bronze_outputs)
|
|
576
|
+
silver = create_derived_columns(silver)
|
|
577
|
+
|
|
578
|
+
if create_holdout:
|
|
579
|
+
silver = create_holdout_mask(silver, holdout_fraction=holdout_fraction)
|
|
580
|
+
|
|
581
|
+
output_path = get_silver_path()
|
|
582
|
+
output_dir = output_path.parent
|
|
583
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
584
|
+
try:
|
|
585
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
586
|
+
storage = get_delta(force_local=True)
|
|
587
|
+
storage.write(silver, str(output_dir / "silver"))
|
|
588
|
+
except ImportError:
|
|
589
|
+
silver.to_parquet(output_path, index=False)
|
|
590
|
+
return silver
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
if __name__ == "__main__":
|
|
594
|
+
run_silver_merge()
|
|
595
|
+
''',
|
|
596
|
+
"gold.py.j2": '''import pandas as pd
|
|
597
|
+
import warnings
|
|
598
|
+
from datetime import datetime
|
|
599
|
+
from pathlib import Path
|
|
600
|
+
{% set all_gold_steps = config.gold.transformations + config.gold.encodings + config.gold.scalings %}
|
|
601
|
+
{% set ops, fitted = collect_imports(all_gold_steps, True) %}
|
|
602
|
+
{% set fs_ops = ['apply_feature_select'] if config.gold.feature_selections else [] %}
|
|
603
|
+
from customer_retention.transforms import ArtifactStore{{ (', ' + (ops | sort | join(', '))) if ops }}{{ (', ' + (fs_ops | join(', '))) if fs_ops and 'apply_feature_select' not in ops }}
|
|
604
|
+
{% if fitted %}
|
|
605
|
+
from customer_retention.transforms.fitted import {{ fitted | sort | join(', ') }}
|
|
606
|
+
{% endif %}
|
|
607
|
+
from config import (get_silver_path, get_gold_path, get_feast_data_path,
|
|
608
|
+
TARGET_COLUMN, RECOMMENDATIONS_HASH, FEAST_REPO_PATH,
|
|
609
|
+
FEAST_FEATURE_VIEW, FEAST_ENTITY_KEY, FEAST_TIMESTAMP_COL, EXPERIMENTS_DIR,
|
|
610
|
+
ARTIFACTS_PATH, FIT_MODE)
|
|
611
|
+
|
|
612
|
+
{% if config.fit_mode %}
|
|
613
|
+
_store = ArtifactStore(Path(ARTIFACTS_PATH))
|
|
614
|
+
{% else %}
|
|
615
|
+
_store = ArtifactStore.from_manifest(Path(ARTIFACTS_PATH) / "manifest.yaml")
|
|
616
|
+
{% endif %}
|
|
617
|
+
|
|
618
|
+
from customer_retention.generators.pipeline_generator.models import (
|
|
619
|
+
PipelineTransformationType,
|
|
620
|
+
TransformationStep,
|
|
621
|
+
)
|
|
622
|
+
|
|
623
|
+
ENCODINGS = [
|
|
624
|
+
{% for enc in config.gold.encodings %}
|
|
625
|
+
TransformationStep(type=PipelineTransformationType.ENCODE, column="{{ enc.column }}", parameters={{ enc.parameters }}, rationale="{{ enc.rationale }}"),
|
|
626
|
+
{% endfor %}
|
|
627
|
+
]
|
|
628
|
+
|
|
629
|
+
SCALINGS = [
|
|
630
|
+
{% for scale in config.gold.scalings %}
|
|
631
|
+
TransformationStep(type=PipelineTransformationType.SCALE, column="{{ scale.column }}", parameters={{ scale.parameters }}, rationale="{{ scale.rationale }}"),
|
|
632
|
+
{% endfor %}
|
|
633
|
+
]
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
def load_silver() -> pd.DataFrame:
|
|
637
|
+
path = get_silver_path()
|
|
638
|
+
parent = path.parent
|
|
639
|
+
delta_path = parent / "silver"
|
|
640
|
+
if delta_path.is_dir() and (delta_path / "_delta_log").is_dir():
|
|
641
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
642
|
+
return get_delta(force_local=True).read(str(delta_path))
|
|
643
|
+
return pd.read_parquet(path)
|
|
644
|
+
|
|
645
|
+
|
|
646
|
+
def load_gold() -> pd.DataFrame:
|
|
647
|
+
path = get_gold_path()
|
|
648
|
+
delta_path = path.parent / "gold"
|
|
649
|
+
if delta_path.is_dir() and (delta_path / "_delta_log").is_dir():
|
|
650
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
651
|
+
return get_delta(force_local=True).read(str(delta_path))
|
|
652
|
+
return pd.read_parquet(path)
|
|
653
|
+
|
|
654
|
+
|
|
655
|
+
{% set transform_groups = group_steps(config.gold.transformations) %}
|
|
656
|
+
|
|
657
|
+
def apply_gold_transformations(df: pd.DataFrame) -> pd.DataFrame:
|
|
658
|
+
{%- if transform_groups %}
|
|
659
|
+
{%- for func_name, steps in transform_groups %}
|
|
660
|
+
df = {{ func_name }}(df)
|
|
661
|
+
{%- endfor %}
|
|
662
|
+
{%- endif %}
|
|
663
|
+
return df
|
|
664
|
+
|
|
665
|
+
{% for func_name, steps in transform_groups %}
|
|
666
|
+
|
|
667
|
+
def {{ func_name }}(df: pd.DataFrame) -> pd.DataFrame:
|
|
668
|
+
{%- set _prov = provenance_docstring_block(steps) %}
|
|
669
|
+
{%- if _prov %}
|
|
670
|
+
{{ _prov }}
|
|
671
|
+
{%- endif %}
|
|
672
|
+
{%- for t in steps %}
|
|
673
|
+
# {{ t.rationale }}
|
|
674
|
+
# {{ action_description(t) }}
|
|
675
|
+
df = {{ render_step_call(t, config.fit_mode) }}
|
|
676
|
+
{%- endfor %}
|
|
677
|
+
return df
|
|
678
|
+
{% endfor %}
|
|
679
|
+
|
|
680
|
+
|
|
681
|
+
def apply_encodings(df: pd.DataFrame) -> pd.DataFrame:
|
|
682
|
+
{%- set _prov = provenance_docstring_block(config.gold.encodings) %}
|
|
683
|
+
{%- if _prov %}
|
|
684
|
+
{{ _prov }}
|
|
685
|
+
{%- endif %}
|
|
686
|
+
{%- if config.gold.encodings %}
|
|
687
|
+
{%- for enc in config.gold.encodings %}
|
|
688
|
+
# {{ enc.rationale }}
|
|
689
|
+
# {{ action_description(enc) }}
|
|
690
|
+
df = {{ render_step_call(enc, config.fit_mode) }}
|
|
691
|
+
{%- endfor %}
|
|
692
|
+
{%- endif %}
|
|
693
|
+
return df
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
def apply_scaling(df: pd.DataFrame) -> pd.DataFrame:
|
|
697
|
+
{%- set _prov = provenance_docstring_block(config.gold.scalings) %}
|
|
698
|
+
{%- if _prov %}
|
|
699
|
+
{{ _prov }}
|
|
700
|
+
{%- endif %}
|
|
701
|
+
{%- if config.gold.scalings %}
|
|
702
|
+
{%- for scale in config.gold.scalings %}
|
|
703
|
+
# {{ scale.rationale }}
|
|
704
|
+
# {{ action_description(scale) }}
|
|
705
|
+
df = {{ render_step_call(scale, config.fit_mode) }}
|
|
706
|
+
{%- endfor %}
|
|
707
|
+
{%- endif %}
|
|
708
|
+
return df
|
|
709
|
+
|
|
710
|
+
|
|
711
|
+
def apply_feature_selection(df: pd.DataFrame) -> pd.DataFrame:
|
|
712
|
+
{% if config.gold.feature_selections %}
|
|
713
|
+
{% for fs in config.gold.feature_selections %}
|
|
714
|
+
# Feature selection
|
|
715
|
+
# drop {{ fs }} (feature selection)
|
|
716
|
+
df = apply_feature_select(df, '{{ fs }}')
|
|
717
|
+
{% endfor %}
|
|
718
|
+
{% endif %}
|
|
719
|
+
return df
|
|
720
|
+
|
|
721
|
+
|
|
722
|
+
def get_feature_version_tag() -> str:
|
|
723
|
+
if RECOMMENDATIONS_HASH:
|
|
724
|
+
return f"v1.0.0_{RECOMMENDATIONS_HASH}"
|
|
725
|
+
return "v1.0.0"
|
|
726
|
+
|
|
727
|
+
|
|
728
|
+
def add_feast_timestamp(df: pd.DataFrame, reference_date=None) -> pd.DataFrame:
|
|
729
|
+
if FEAST_TIMESTAMP_COL not in df.columns:
|
|
730
|
+
if "aggregation_reference_date" in df.attrs:
|
|
731
|
+
timestamp = df.attrs["aggregation_reference_date"]
|
|
732
|
+
print(f" Using aggregation reference_date for Feast timestamp: {timestamp}")
|
|
733
|
+
elif reference_date is not None:
|
|
734
|
+
timestamp = reference_date
|
|
735
|
+
print(f" Using provided reference_date for Feast timestamp: {timestamp}")
|
|
736
|
+
else:
|
|
737
|
+
timestamp = datetime.now()
|
|
738
|
+
warnings.warn(
|
|
739
|
+
f"No reference_date available for Feast timestamp. Using datetime.now() ({timestamp}). "
|
|
740
|
+
"This may cause temporal leakage - features should use actual aggregation dates. "
|
|
741
|
+
"Set aggregation_reference_date in DataFrame.attrs during aggregation.",
|
|
742
|
+
UserWarning
|
|
743
|
+
)
|
|
744
|
+
df[FEAST_TIMESTAMP_COL] = timestamp
|
|
745
|
+
return df
|
|
746
|
+
|
|
747
|
+
|
|
748
|
+
def materialize_to_feast(df: pd.DataFrame) -> None:
|
|
749
|
+
feast_path = get_feast_data_path()
|
|
750
|
+
feast_path.parent.mkdir(parents=True, exist_ok=True)
|
|
751
|
+
df_feast = df.copy()
|
|
752
|
+
df_feast = add_feast_timestamp(df_feast)
|
|
753
|
+
original_cols = [c for c in df_feast.columns if c.startswith("original_")]
|
|
754
|
+
if original_cols:
|
|
755
|
+
print(f" Excluding holdout columns from Feast: {original_cols}")
|
|
756
|
+
df_feast = df_feast.drop(columns=original_cols, errors="ignore")
|
|
757
|
+
try:
|
|
758
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
759
|
+
storage = get_delta(force_local=True)
|
|
760
|
+
storage.write(df_feast, str(feast_path.parent / feast_path.stem))
|
|
761
|
+
except ImportError:
|
|
762
|
+
df_feast.to_parquet(feast_path, index=False)
|
|
763
|
+
print(f"Features materialized to Feast: {feast_path}")
|
|
764
|
+
print(f" Entity key: {FEAST_ENTITY_KEY}")
|
|
765
|
+
print(f" Feature view: {FEAST_FEATURE_VIEW}")
|
|
766
|
+
print(f" Rows: {len(df_feast):,}")
|
|
767
|
+
|
|
768
|
+
|
|
769
|
+
def run_gold_features():
|
|
770
|
+
silver = load_silver()
|
|
771
|
+
gold = apply_gold_transformations(silver)
|
|
772
|
+
gold = apply_encodings(gold)
|
|
773
|
+
gold = apply_scaling(gold)
|
|
774
|
+
gold = apply_feature_selection(gold)
|
|
775
|
+
{% if config.fit_mode %}
|
|
776
|
+
_store.save_manifest()
|
|
777
|
+
print(f"Fit artifacts saved to: {ARTIFACTS_PATH}")
|
|
778
|
+
{% endif %}
|
|
779
|
+
output_path = get_gold_path()
|
|
780
|
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
781
|
+
gold.attrs["recommendations_hash"] = RECOMMENDATIONS_HASH
|
|
782
|
+
gold.attrs["feature_version"] = get_feature_version_tag()
|
|
783
|
+
try:
|
|
784
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
785
|
+
storage = get_delta(force_local=True)
|
|
786
|
+
storage.write(gold, str(output_path.parent / "gold"))
|
|
787
|
+
except ImportError:
|
|
788
|
+
gold.to_parquet(output_path, index=False)
|
|
789
|
+
print(f"Gold features saved with version: {get_feature_version_tag()}")
|
|
790
|
+
materialize_to_feast(gold)
|
|
791
|
+
return gold
|
|
792
|
+
|
|
793
|
+
|
|
794
|
+
if __name__ == "__main__":
|
|
795
|
+
run_gold_features()
|
|
796
|
+
''',
|
|
797
|
+
"training.py.j2": '''import pandas as pd
|
|
798
|
+
import mlflow
|
|
799
|
+
import mlflow.sklearn
|
|
800
|
+
import mlflow.xgboost
|
|
801
|
+
import xgboost as xgb
|
|
802
|
+
from pathlib import Path
|
|
803
|
+
from feast import FeatureStore
|
|
804
|
+
from sklearn.model_selection import train_test_split, cross_val_score
|
|
805
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
806
|
+
from sklearn.linear_model import LogisticRegression
|
|
807
|
+
from sklearn.preprocessing import LabelEncoder
|
|
808
|
+
from sklearn.metrics import (roc_auc_score, average_precision_score, f1_score,
|
|
809
|
+
precision_score, recall_score, accuracy_score)
|
|
810
|
+
from config import (TARGET_COLUMN, PIPELINE_NAME, RECOMMENDATIONS_HASH, MLFLOW_TRACKING_URI, MLFLOW_ARTIFACT_ROOT,
|
|
811
|
+
FEAST_REPO_PATH, FEAST_FEATURE_VIEW, FEAST_ENTITY_KEY, FEAST_TIMESTAMP_COL,
|
|
812
|
+
get_feast_data_path)
|
|
813
|
+
|
|
814
|
+
# Set tracking URI immediately to prevent default mlruns directory creation
|
|
815
|
+
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
|
|
816
|
+
|
|
817
|
+
|
|
818
|
+
def _load_feast_data():
|
|
819
|
+
feast_path = get_feast_data_path()
|
|
820
|
+
delta_path = feast_path.parent / feast_path.stem
|
|
821
|
+
if delta_path.is_dir() and (delta_path / "_delta_log").is_dir():
|
|
822
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
823
|
+
return get_delta(force_local=True).read(str(delta_path))
|
|
824
|
+
return pd.read_parquet(feast_path)
|
|
825
|
+
|
|
826
|
+
|
|
827
|
+
def get_training_data_from_feast() -> pd.DataFrame:
|
|
828
|
+
"""Retrieve training data from Feast for training/serving consistency.
|
|
829
|
+
|
|
830
|
+
Uses get_historical_features for point-in-time correct feature retrieval.
|
|
831
|
+
This ensures training uses the exact same feature retrieval path as inference.
|
|
832
|
+
"""
|
|
833
|
+
feast_path = Path(FEAST_REPO_PATH)
|
|
834
|
+
|
|
835
|
+
# Check if Feast repo is initialized
|
|
836
|
+
if not (feast_path / "feature_store.yaml").exists():
|
|
837
|
+
print("Feast repo not initialized, falling back to data file")
|
|
838
|
+
return _load_feast_data()
|
|
839
|
+
|
|
840
|
+
try:
|
|
841
|
+
store = FeatureStore(repo_path=str(feast_path))
|
|
842
|
+
|
|
843
|
+
# Read the materialized features to get entity keys and timestamps
|
|
844
|
+
features_df = _load_feast_data()
|
|
845
|
+
|
|
846
|
+
# Create entity dataframe for historical feature retrieval
|
|
847
|
+
entity_df = features_df[[FEAST_ENTITY_KEY, FEAST_TIMESTAMP_COL]].copy()
|
|
848
|
+
|
|
849
|
+
# Get all feature names (excluding entity key, timestamp, target, and holdout ground truth)
|
|
850
|
+
exclude_cols = {FEAST_ENTITY_KEY, FEAST_TIMESTAMP_COL, TARGET_COLUMN}
|
|
851
|
+
feature_cols = [c for c in features_df.columns
|
|
852
|
+
if c not in exclude_cols and not c.startswith("original_")]
|
|
853
|
+
|
|
854
|
+
# Build feature references
|
|
855
|
+
feature_refs = [f"{FEAST_FEATURE_VIEW}:{col}" for col in feature_cols]
|
|
856
|
+
|
|
857
|
+
print(f"Retrieving {len(feature_refs)} features from Feast...")
|
|
858
|
+
print(f" Feature view: {FEAST_FEATURE_VIEW}")
|
|
859
|
+
print(f" Entity key: {FEAST_ENTITY_KEY}")
|
|
860
|
+
|
|
861
|
+
# Get historical features with point-in-time correctness
|
|
862
|
+
training_df = store.get_historical_features(
|
|
863
|
+
entity_df=entity_df,
|
|
864
|
+
features=feature_refs
|
|
865
|
+
).to_df()
|
|
866
|
+
|
|
867
|
+
# Add target column back
|
|
868
|
+
training_df = training_df.merge(
|
|
869
|
+
features_df[[FEAST_ENTITY_KEY, TARGET_COLUMN]],
|
|
870
|
+
on=FEAST_ENTITY_KEY,
|
|
871
|
+
how="left"
|
|
872
|
+
)
|
|
873
|
+
|
|
874
|
+
print(f" Retrieved {len(training_df):,} rows, {len(training_df.columns)} columns")
|
|
875
|
+
return training_df
|
|
876
|
+
|
|
877
|
+
except Exception as e:
|
|
878
|
+
print(f"Feast retrieval failed ({e}), falling back to data file")
|
|
879
|
+
return _load_feast_data()
|
|
880
|
+
|
|
881
|
+
|
|
882
|
+
def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
|
|
883
|
+
"""Prepare features for model training.
|
|
884
|
+
|
|
885
|
+
Explicitly excludes original_* columns which contain holdout ground truth.
|
|
886
|
+
These columns are reserved for scoring validation and must never be used in training.
|
|
887
|
+
"""
|
|
888
|
+
df = df.copy()
|
|
889
|
+
|
|
890
|
+
# Drop Feast metadata columns
|
|
891
|
+
drop_cols = [FEAST_ENTITY_KEY, FEAST_TIMESTAMP_COL]
|
|
892
|
+
df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")
|
|
893
|
+
|
|
894
|
+
# Exclude original_* columns (holdout ground truth - prevents data leakage)
|
|
895
|
+
original_cols = [c for c in df.columns if c.startswith("original_")]
|
|
896
|
+
df = df.drop(columns=original_cols, errors="ignore")
|
|
897
|
+
|
|
898
|
+
# Encode categorical columns
|
|
899
|
+
for col in df.select_dtypes(include=["object", "category"]).columns:
|
|
900
|
+
df[col] = LabelEncoder().fit_transform(df[col].astype(str))
|
|
901
|
+
|
|
902
|
+
return df.select_dtypes(include=["int64", "float64", "int32", "float32"]).fillna(0)
|
|
903
|
+
|
|
904
|
+
|
|
905
|
+
def compute_metrics(y_true, y_proba, y_pred) -> dict:
|
|
906
|
+
return {
|
|
907
|
+
"roc_auc": roc_auc_score(y_true, y_proba),
|
|
908
|
+
"pr_auc": average_precision_score(y_true, y_proba),
|
|
909
|
+
"f1": f1_score(y_true, y_pred),
|
|
910
|
+
"precision": precision_score(y_true, y_pred),
|
|
911
|
+
"recall": recall_score(y_true, y_pred),
|
|
912
|
+
"accuracy": accuracy_score(y_true, y_pred),
|
|
913
|
+
}
|
|
914
|
+
|
|
915
|
+
|
|
916
|
+
def get_feature_importance(model, feature_names) -> pd.DataFrame:
|
|
917
|
+
if hasattr(model, "feature_importances_"):
|
|
918
|
+
importance = model.feature_importances_
|
|
919
|
+
elif hasattr(model, "coef_"):
|
|
920
|
+
importance = abs(model.coef_[0])
|
|
921
|
+
else:
|
|
922
|
+
return None
|
|
923
|
+
df = pd.DataFrame({"feature": feature_names, "importance": importance})
|
|
924
|
+
return df.sort_values("importance", ascending=False).reset_index(drop=True)
|
|
925
|
+
|
|
926
|
+
|
|
927
|
+
def log_feature_importance(model, feature_names):
|
|
928
|
+
fi = get_feature_importance(model, feature_names)
|
|
929
|
+
if fi is None:
|
|
930
|
+
return
|
|
931
|
+
fi.to_csv("feature_importance.csv", index=False)
|
|
932
|
+
mlflow.log_artifact("feature_importance.csv")
|
|
933
|
+
|
|
934
|
+
|
|
935
|
+
def train_xgboost(X_train, y_train, X_test, y_test, feature_names):
|
|
936
|
+
mlflow.xgboost.autolog(log_datasets=False, log_models=False)
|
|
937
|
+
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
|
|
938
|
+
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_names)
|
|
939
|
+
params = {"objective": "binary:logistic", "eval_metric": ["auc", "logloss"],
|
|
940
|
+
"max_depth": 6, "learning_rate": 0.1, "seed": 42}
|
|
941
|
+
model = xgb.train(params, dtrain, num_boost_round=100,
|
|
942
|
+
evals=[(dtrain, "train"), (dtest, "eval")], verbose_eval=False)
|
|
943
|
+
return model
|
|
944
|
+
|
|
945
|
+
|
|
946
|
+
def get_model_name_with_hash(base_name: str) -> str:
|
|
947
|
+
if RECOMMENDATIONS_HASH:
|
|
948
|
+
return f"{base_name}_{RECOMMENDATIONS_HASH}"
|
|
949
|
+
return base_name
|
|
950
|
+
|
|
951
|
+
|
|
952
|
+
def run_experiment():
|
|
953
|
+
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
|
|
954
|
+
experiment = mlflow.get_experiment_by_name(PIPELINE_NAME)
|
|
955
|
+
if experiment is None:
|
|
956
|
+
mlflow.create_experiment(PIPELINE_NAME, artifact_location=MLFLOW_ARTIFACT_ROOT)
|
|
957
|
+
mlflow.set_experiment(PIPELINE_NAME)
|
|
958
|
+
print(f"MLflow tracking: {MLFLOW_TRACKING_URI}")
|
|
959
|
+
print(f"Artifacts: {MLFLOW_ARTIFACT_ROOT}")
|
|
960
|
+
|
|
961
|
+
# Load training data from Feast (ensures training/serving consistency)
|
|
962
|
+
print("\\nLoading training data from Feast...")
|
|
963
|
+
training_data = get_training_data_from_feast()
|
|
964
|
+
|
|
965
|
+
y = training_data[TARGET_COLUMN]
|
|
966
|
+
X = prepare_features(training_data.drop(columns=[TARGET_COLUMN]))
|
|
967
|
+
feature_names = list(X.columns)
|
|
968
|
+
train_mask = y.notna()
|
|
969
|
+
X, y = X.loc[train_mask], y.loc[train_mask]
|
|
970
|
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
|
|
971
|
+
|
|
972
|
+
sklearn_models = {
|
|
973
|
+
"logistic_regression": LogisticRegression(max_iter=5000, random_state=42),
|
|
974
|
+
"random_forest": RandomForestClassifier(n_estimators=100, random_state=42),
|
|
975
|
+
}
|
|
976
|
+
|
|
977
|
+
run_name = get_model_name_with_hash("pipeline_run")
|
|
978
|
+
with mlflow.start_run(run_name=run_name):
|
|
979
|
+
mlflow.log_params({"train_samples": len(X_train), "test_samples": len(X_test), "n_features": X.shape[1]})
|
|
980
|
+
mlflow.set_tag("feature_source", "feast")
|
|
981
|
+
mlflow.set_tag("feast_feature_view", FEAST_FEATURE_VIEW)
|
|
982
|
+
if RECOMMENDATIONS_HASH:
|
|
983
|
+
mlflow.set_tag("recommendations_hash", RECOMMENDATIONS_HASH)
|
|
984
|
+
best_model, best_auc = None, 0
|
|
985
|
+
|
|
986
|
+
for name, model in sklearn_models.items():
|
|
987
|
+
with mlflow.start_run(run_name=name, nested=True):
|
|
988
|
+
if RECOMMENDATIONS_HASH:
|
|
989
|
+
mlflow.set_tag("recommendations_hash", RECOMMENDATIONS_HASH)
|
|
990
|
+
mlflow.set_tag("feature_source", "feast")
|
|
991
|
+
model.fit(X_train, y_train)
|
|
992
|
+
y_proba = model.predict_proba(X_test)[:, 1]
|
|
993
|
+
y_pred = model.predict(X_test)
|
|
994
|
+
metrics = compute_metrics(y_test, y_proba, y_pred)
|
|
995
|
+
cv = cross_val_score(model, X_train, y_train, cv=5, scoring="roc_auc")
|
|
996
|
+
mlflow.log_metrics({**metrics, "cv_mean": cv.mean(), "cv_std": cv.std()})
|
|
997
|
+
log_feature_importance(model, feature_names)
|
|
998
|
+
model_artifact_name = get_model_name_with_hash(f"model_{name}")
|
|
999
|
+
mlflow.sklearn.log_model(model, name=model_artifact_name)
|
|
1000
|
+
print(f"{name}: ROC-AUC={metrics['roc_auc']:.4f}, PR-AUC={metrics['pr_auc']:.4f}, F1={metrics['f1']:.4f}")
|
|
1001
|
+
if metrics["roc_auc"] > best_auc:
|
|
1002
|
+
best_auc, best_model = metrics["roc_auc"], name
|
|
1003
|
+
|
|
1004
|
+
with mlflow.start_run(run_name="xgboost", nested=True):
|
|
1005
|
+
if RECOMMENDATIONS_HASH:
|
|
1006
|
+
mlflow.set_tag("recommendations_hash", RECOMMENDATIONS_HASH)
|
|
1007
|
+
mlflow.set_tag("feature_source", "feast")
|
|
1008
|
+
xgb_model = train_xgboost(X_train, y_train, X_test, y_test, feature_names)
|
|
1009
|
+
dtest = xgb.DMatrix(X_test, feature_names=feature_names)
|
|
1010
|
+
y_proba = xgb_model.predict(dtest)
|
|
1011
|
+
y_pred = (y_proba > 0.5).astype(int)
|
|
1012
|
+
metrics = compute_metrics(y_test, y_proba, y_pred)
|
|
1013
|
+
mlflow.log_metrics(metrics)
|
|
1014
|
+
xgb_model_name = get_model_name_with_hash("model_xgboost")
|
|
1015
|
+
mlflow.xgboost.log_model(xgb_model, name=xgb_model_name)
|
|
1016
|
+
importance = xgb_model.get_score(importance_type="gain")
|
|
1017
|
+
fi = pd.DataFrame({"feature": importance.keys(), "importance": importance.values()})
|
|
1018
|
+
fi = fi.sort_values("importance", ascending=False).reset_index(drop=True)
|
|
1019
|
+
fi.to_csv("feature_importance.csv", index=False)
|
|
1020
|
+
mlflow.log_artifact("feature_importance.csv")
|
|
1021
|
+
print(f"xgboost: ROC-AUC={metrics['roc_auc']:.4f}, PR-AUC={metrics['pr_auc']:.4f}, F1={metrics['f1']:.4f}")
|
|
1022
|
+
if metrics["roc_auc"] > best_auc:
|
|
1023
|
+
best_auc, best_model = metrics["roc_auc"], "xgboost"
|
|
1024
|
+
|
|
1025
|
+
mlflow.set_tag("best_model", best_model)
|
|
1026
|
+
mlflow.log_metric("best_roc_auc", best_auc)
|
|
1027
|
+
print(f"Best: {best_model} (ROC-AUC={best_auc:.4f})")
|
|
1028
|
+
|
|
1029
|
+
|
|
1030
|
+
if __name__ == "__main__":
|
|
1031
|
+
run_experiment()
|
|
1032
|
+
''',
|
|
1033
|
+
"runner.py.j2": '''import argparse
|
|
1034
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
1035
|
+
from config import PIPELINE_NAME, EXPERIMENTS_DIR, PRODUCTION_DIR
|
|
1036
|
+
{% for name in config.landing %}
|
|
1037
|
+
from landing.landing_{{ name }} import run_landing_{{ name }}
|
|
1038
|
+
{% endfor %}
|
|
1039
|
+
{% for name in config.bronze %}
|
|
1040
|
+
from bronze.bronze_{{ name }} import run_bronze_{{ name }}
|
|
1041
|
+
{% endfor %}
|
|
1042
|
+
{% for name in config.bronze_event %}
|
|
1043
|
+
from bronze.bronze_{{ name }} import run_bronze_{{ name }}
|
|
1044
|
+
{% endfor %}
|
|
1045
|
+
from silver.silver_merge import run_silver_merge
|
|
1046
|
+
from gold.gold_features import run_gold_features
|
|
1047
|
+
from training.ml_experiment import run_experiment
|
|
1048
|
+
|
|
1049
|
+
|
|
1050
|
+
def setup_experiments_dir():
|
|
1051
|
+
EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)
|
|
1052
|
+
(EXPERIMENTS_DIR / "mlruns").mkdir(parents=True, exist_ok=True)
|
|
1053
|
+
PRODUCTION_DIR.mkdir(parents=True, exist_ok=True)
|
|
1054
|
+
(PRODUCTION_DIR / "data" / "bronze").mkdir(parents=True, exist_ok=True)
|
|
1055
|
+
(PRODUCTION_DIR / "data" / "silver").mkdir(parents=True, exist_ok=True)
|
|
1056
|
+
(PRODUCTION_DIR / "data" / "gold").mkdir(parents=True, exist_ok=True)
|
|
1057
|
+
|
|
1058
|
+
|
|
1059
|
+
def run_pipeline(validate=False):
|
|
1060
|
+
print(f"Starting pipeline: {PIPELINE_NAME}")
|
|
1061
|
+
setup_experiments_dir()
|
|
1062
|
+
{% if config.landing %}
|
|
1063
|
+
|
|
1064
|
+
print("\\n[1/6] Landing (event sources)...")
|
|
1065
|
+
{% for name in config.landing %}
|
|
1066
|
+
run_landing_{{ name }}()
|
|
1067
|
+
{% endfor %}
|
|
1068
|
+
print("Landing complete")
|
|
1069
|
+
if validate:
|
|
1070
|
+
from validation.validate_pipeline import validate_landing
|
|
1071
|
+
validate_landing()
|
|
1072
|
+
{% endif %}
|
|
1073
|
+
|
|
1074
|
+
print("\\n[{{ '2/6' if config.landing else '1/4' }}] Bronze (parallel)...")
|
|
1075
|
+
with ThreadPoolExecutor(max_workers={{ (config.bronze | length) + (config.bronze_event | length) }}) as executor:
|
|
1076
|
+
bronze_futures = [
|
|
1077
|
+
{% for name in config.bronze %}
|
|
1078
|
+
executor.submit(run_bronze_{{ name }}),
|
|
1079
|
+
{% endfor %}
|
|
1080
|
+
{% for name in config.bronze_event %}
|
|
1081
|
+
executor.submit(run_bronze_{{ name }}),
|
|
1082
|
+
{% endfor %}
|
|
1083
|
+
]
|
|
1084
|
+
for f in bronze_futures:
|
|
1085
|
+
f.result()
|
|
1086
|
+
print("Bronze complete")
|
|
1087
|
+
if validate:
|
|
1088
|
+
from validation.validate_pipeline import validate_bronze
|
|
1089
|
+
validate_bronze()
|
|
1090
|
+
|
|
1091
|
+
print("\\n[{{ '3/6' if config.landing else '2/4' }}] Silver...")
|
|
1092
|
+
run_silver_merge()
|
|
1093
|
+
print("Silver complete")
|
|
1094
|
+
if validate:
|
|
1095
|
+
from validation.validate_pipeline import validate_silver
|
|
1096
|
+
validate_silver()
|
|
1097
|
+
|
|
1098
|
+
print("\\n[{{ '4/6' if config.landing else '3/4' }}] Gold...")
|
|
1099
|
+
run_gold_features()
|
|
1100
|
+
print("Gold complete")
|
|
1101
|
+
if validate:
|
|
1102
|
+
from validation.validate_pipeline import validate_gold
|
|
1103
|
+
validate_gold()
|
|
1104
|
+
|
|
1105
|
+
print("\\n[{{ '5/6' if config.landing else '4/4' }}] Training...")
|
|
1106
|
+
run_experiment()
|
|
1107
|
+
print("Training complete")
|
|
1108
|
+
if validate:
|
|
1109
|
+
from validation.validate_pipeline import validate_training
|
|
1110
|
+
validate_training()
|
|
1111
|
+
|
|
1112
|
+
|
|
1113
|
+
if __name__ == "__main__":
|
|
1114
|
+
parser = argparse.ArgumentParser()
|
|
1115
|
+
parser.add_argument("--validate", action="store_true")
|
|
1116
|
+
args = parser.parse_args()
|
|
1117
|
+
run_pipeline(validate=args.validate)
|
|
1118
|
+
''',
|
|
1119
|
+
"run_all.py.j2": '''"""{{ config.name }} - Pipeline Runner with MLflow UI
|
|
1120
|
+
|
|
1121
|
+
All artifacts (data, mlruns, feast) are stored in the experiments directory.
|
|
1122
|
+
Override location with CR_EXPERIMENTS_DIR environment variable.
|
|
1123
|
+
"""
|
|
1124
|
+
import os
|
|
1125
|
+
import sys
|
|
1126
|
+
import webbrowser
|
|
1127
|
+
import subprocess
|
|
1128
|
+
import time
|
|
1129
|
+
from pathlib import Path
|
|
1130
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
1131
|
+
|
|
1132
|
+
sys.path.insert(0, str(Path(__file__).parent))
|
|
1133
|
+
|
|
1134
|
+
from config import PIPELINE_NAME, SOURCES, MLFLOW_TRACKING_URI, EXPERIMENTS_DIR, PRODUCTION_DIR, FINDINGS_DIR
|
|
1135
|
+
{% for name in config.landing %}
|
|
1136
|
+
from landing.landing_{{ name }} import run_landing_{{ name }}
|
|
1137
|
+
{% endfor %}
|
|
1138
|
+
{% for name in config.bronze %}
|
|
1139
|
+
from bronze.bronze_{{ name }} import run_bronze_{{ name }}
|
|
1140
|
+
{% endfor %}
|
|
1141
|
+
{% for name in config.bronze_event %}
|
|
1142
|
+
from bronze.bronze_{{ name }} import run_bronze_{{ name }}
|
|
1143
|
+
{% endfor %}
|
|
1144
|
+
from silver.silver_merge import run_silver_merge
|
|
1145
|
+
from gold.gold_features import run_gold_features
|
|
1146
|
+
from training.ml_experiment import run_experiment
|
|
1147
|
+
|
|
1148
|
+
|
|
1149
|
+
def setup_experiments_dir():
|
|
1150
|
+
EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)
|
|
1151
|
+
(EXPERIMENTS_DIR / "mlruns").mkdir(parents=True, exist_ok=True)
|
|
1152
|
+
PRODUCTION_DIR.mkdir(parents=True, exist_ok=True)
|
|
1153
|
+
(PRODUCTION_DIR / "data" / "bronze").mkdir(parents=True, exist_ok=True)
|
|
1154
|
+
(PRODUCTION_DIR / "data" / "silver").mkdir(parents=True, exist_ok=True)
|
|
1155
|
+
(PRODUCTION_DIR / "data" / "gold").mkdir(parents=True, exist_ok=True)
|
|
1156
|
+
print(f"Experiments directory: {EXPERIMENTS_DIR}")
|
|
1157
|
+
print(f"Production directory: {PRODUCTION_DIR}")
|
|
1158
|
+
print(f"MLflow tracking: {MLFLOW_TRACKING_URI}")
|
|
1159
|
+
print(f"Findings directory: {FINDINGS_DIR}")
|
|
1160
|
+
|
|
1161
|
+
|
|
1162
|
+
def run_landing():
|
|
1163
|
+
{% for name in config.landing %}
|
|
1164
|
+
run_landing_{{ name }}()
|
|
1165
|
+
{% endfor %}
|
|
1166
|
+
pass
|
|
1167
|
+
|
|
1168
|
+
|
|
1169
|
+
def run_bronze_parallel():
|
|
1170
|
+
bronze_funcs = [
|
|
1171
|
+
{% for name in config.bronze %}
|
|
1172
|
+
run_bronze_{{ name }},
|
|
1173
|
+
{% endfor %}
|
|
1174
|
+
{% for name in config.bronze_event %}
|
|
1175
|
+
run_bronze_{{ name }},
|
|
1176
|
+
{% endfor %}
|
|
1177
|
+
]
|
|
1178
|
+
with ThreadPoolExecutor(max_workers={{ (config.bronze | length) + (config.bronze_event | length) }}) as ex:
|
|
1179
|
+
list(ex.map(lambda f: f(), bronze_funcs))
|
|
1180
|
+
|
|
1181
|
+
|
|
1182
|
+
def is_port_in_use(port):
|
|
1183
|
+
import socket
|
|
1184
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
1185
|
+
return s.connect_ex(('localhost', port)) == 0
|
|
1186
|
+
|
|
1187
|
+
|
|
1188
|
+
def start_mlflow_ui():
|
|
1189
|
+
port = 5050
|
|
1190
|
+
if is_port_in_use(port):
|
|
1191
|
+
print(f"\\n⚠ Port {port} is already in use.")
|
|
1192
|
+
print(f" Either mlflow is already running, or kill the old process:")
|
|
1193
|
+
print(f" pkill -f 'mlflow ui'")
|
|
1194
|
+
print(f"\\n Opening browser to existing server...")
|
|
1195
|
+
webbrowser.open(f"http://localhost:{port}")
|
|
1196
|
+
return None
|
|
1197
|
+
|
|
1198
|
+
print(f"\\nStarting MLflow UI (tracking: {MLFLOW_TRACKING_URI})...")
|
|
1199
|
+
process = subprocess.Popen(
|
|
1200
|
+
["mlflow", "ui", "--backend-store-uri", MLFLOW_TRACKING_URI, "--port", str(port)],
|
|
1201
|
+
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
|
|
1202
|
+
)
|
|
1203
|
+
time.sleep(2)
|
|
1204
|
+
webbrowser.open(f"http://localhost:{port}")
|
|
1205
|
+
print(f"MLflow UI running at http://localhost:{port}")
|
|
1206
|
+
print("Press Ctrl+C to stop")
|
|
1207
|
+
return process
|
|
1208
|
+
|
|
1209
|
+
|
|
1210
|
+
def run_pipeline():
|
|
1211
|
+
print(f"Running {PIPELINE_NAME}")
|
|
1212
|
+
print("=" * 50)
|
|
1213
|
+
|
|
1214
|
+
setup_experiments_dir()
|
|
1215
|
+
{% if config.landing %}
|
|
1216
|
+
|
|
1217
|
+
print("\\n[1/6] Landing (event sources)...")
|
|
1218
|
+
run_landing()
|
|
1219
|
+
print("Landing complete")
|
|
1220
|
+
{% endif %}
|
|
1221
|
+
|
|
1222
|
+
print("\\n[{{ '2/6' if config.landing else '1/4' }}] Bronze (parallel)...")
|
|
1223
|
+
run_bronze_parallel()
|
|
1224
|
+
print("Bronze complete")
|
|
1225
|
+
|
|
1226
|
+
print("\\n[{{ '3/6' if config.landing else '2/4' }}] Silver...")
|
|
1227
|
+
run_silver_merge()
|
|
1228
|
+
print("Silver complete")
|
|
1229
|
+
|
|
1230
|
+
print("\\n[{{ '4/6' if config.landing else '3/4' }}] Gold...")
|
|
1231
|
+
run_gold_features()
|
|
1232
|
+
print("Gold complete")
|
|
1233
|
+
|
|
1234
|
+
print("\\n[{{ '5/6' if config.landing else '4/4' }}] Training...")
|
|
1235
|
+
run_experiment()
|
|
1236
|
+
print("Training complete")
|
|
1237
|
+
|
|
1238
|
+
print("\\n" + "=" * 50)
|
|
1239
|
+
print("Pipeline finished!")
|
|
1240
|
+
|
|
1241
|
+
mlflow_process = start_mlflow_ui()
|
|
1242
|
+
if mlflow_process:
|
|
1243
|
+
try:
|
|
1244
|
+
mlflow_process.wait()
|
|
1245
|
+
except KeyboardInterrupt:
|
|
1246
|
+
mlflow_process.terminate()
|
|
1247
|
+
print("\\nMLflow UI stopped")
|
|
1248
|
+
|
|
1249
|
+
|
|
1250
|
+
if __name__ == "__main__":
|
|
1251
|
+
run_pipeline()
|
|
1252
|
+
''',
|
|
1253
|
+
"workflow.json.j2": """{
|
|
1254
|
+
"name": "{{ config.name }}_pipeline",
|
|
1255
|
+
"tasks": [
|
|
1256
|
+
{% for name in config.landing %}
|
|
1257
|
+
{
|
|
1258
|
+
"task_key": "landing_{{ name }}",
|
|
1259
|
+
"notebook_task": {
|
|
1260
|
+
"notebook_path": "/Workspace/orchestration/{{ config.name }}/landing/landing_{{ name }}"
|
|
1261
|
+
}
|
|
1262
|
+
},
|
|
1263
|
+
{% endfor %}
|
|
1264
|
+
{% for source in config.sources %}
|
|
1265
|
+
{
|
|
1266
|
+
"task_key": "bronze_{{ source.name }}",
|
|
1267
|
+
{% if config.landing %}
|
|
1268
|
+
"depends_on": [
|
|
1269
|
+
{% for name in config.landing %}
|
|
1270
|
+
{"task_key": "landing_{{ name }}"}{{ "," if not loop.last else "" }}
|
|
1271
|
+
{% endfor %}
|
|
1272
|
+
],
|
|
1273
|
+
{% endif %}
|
|
1274
|
+
"notebook_task": {
|
|
1275
|
+
"notebook_path": "/Workspace/orchestration/{{ config.name }}/bronze/bronze_{{ source.name }}"
|
|
1276
|
+
}
|
|
1277
|
+
},
|
|
1278
|
+
{% endfor %}
|
|
1279
|
+
{
|
|
1280
|
+
"task_key": "silver_merge",
|
|
1281
|
+
"depends_on": [
|
|
1282
|
+
{% for source in config.sources %}
|
|
1283
|
+
{"task_key": "bronze_{{ source.name }}"}{{ "," if not loop.last else "" }}
|
|
1284
|
+
{% endfor %}
|
|
1285
|
+
],
|
|
1286
|
+
"notebook_task": {
|
|
1287
|
+
"notebook_path": "/Workspace/orchestration/{{ config.name }}/silver/silver_merge"
|
|
1288
|
+
}
|
|
1289
|
+
},
|
|
1290
|
+
{
|
|
1291
|
+
"task_key": "gold_features",
|
|
1292
|
+
"depends_on": [{"task_key": "silver_merge"}],
|
|
1293
|
+
"notebook_task": {
|
|
1294
|
+
"notebook_path": "/Workspace/orchestration/{{ config.name }}/gold/gold_features"
|
|
1295
|
+
}
|
|
1296
|
+
},
|
|
1297
|
+
{
|
|
1298
|
+
"task_key": "ml_experiment",
|
|
1299
|
+
"depends_on": [{"task_key": "gold_features"}],
|
|
1300
|
+
"notebook_task": {
|
|
1301
|
+
"notebook_path": "/Workspace/orchestration/{{ config.name }}/training/ml_experiment"
|
|
1302
|
+
}
|
|
1303
|
+
}
|
|
1304
|
+
]
|
|
1305
|
+
}
|
|
1306
|
+
""",
|
|
1307
|
+
"feature_store.yaml.j2": """project: {{ config.name }}
|
|
1308
|
+
registry: data/registry.db
|
|
1309
|
+
provider: local
|
|
1310
|
+
online_store:
|
|
1311
|
+
type: sqlite
|
|
1312
|
+
path: data/online_store.db
|
|
1313
|
+
offline_store:
|
|
1314
|
+
type: file
|
|
1315
|
+
entity_key_serialization_version: 2
|
|
1316
|
+
""",
|
|
1317
|
+
"features.py.j2": '''"""Feast Feature Definitions for {{ config.name }}
|
|
1318
|
+
|
|
1319
|
+
Auto-generated feature view definitions for training/serving consistency.
|
|
1320
|
+
Feature version: {{ config.recommendations_hash or "unversioned" }}
|
|
1321
|
+
"""
|
|
1322
|
+
from datetime import timedelta
|
|
1323
|
+
from feast import Entity, FeatureView, Field, FileSource
|
|
1324
|
+
from feast.types import Float32, Float64, Int64, String
|
|
1325
|
+
|
|
1326
|
+
# Entity definition
|
|
1327
|
+
{{ config.feast.entity_name if config.feast else "customer" }} = Entity(
|
|
1328
|
+
name="{{ config.feast.entity_name if config.feast else 'customer' }}",
|
|
1329
|
+
join_keys=["{{ config.feast.entity_key if config.feast else config.sources[0].entity_key }}"],
|
|
1330
|
+
description="Primary entity for {{ config.name }} pipeline"
|
|
1331
|
+
)
|
|
1332
|
+
|
|
1333
|
+
# File source pointing to materialized features
|
|
1334
|
+
{{ config.feast.feature_view_name if config.feast else config.name + '_features' }}_source = FileSource(
|
|
1335
|
+
path="data/{{ config.feast.feature_view_name if config.feast else config.name + '_features' }}.parquet",
|
|
1336
|
+
timestamp_field="{{ config.feast.timestamp_column if config.feast else 'event_timestamp' }}"
|
|
1337
|
+
)
|
|
1338
|
+
|
|
1339
|
+
# Feature view definition
|
|
1340
|
+
# Note: Features are dynamically determined from the parquet file schema
|
|
1341
|
+
# This is a placeholder that gets populated when feast apply is run
|
|
1342
|
+
{{ config.feast.feature_view_name if config.feast else config.name + '_features' }} = FeatureView(
|
|
1343
|
+
name="{{ config.feast.feature_view_name if config.feast else config.name + '_features' }}",
|
|
1344
|
+
entities=[{{ config.feast.entity_name if config.feast else "customer" }}],
|
|
1345
|
+
ttl=timedelta(days={{ config.feast.ttl_days if config.feast else 365 }}),
|
|
1346
|
+
source={{ config.feast.feature_view_name if config.feast else config.name + '_features' }}_source,
|
|
1347
|
+
tags={
|
|
1348
|
+
"pipeline": "{{ config.name }}",
|
|
1349
|
+
"recommendations_hash": "{{ config.recommendations_hash or 'none' }}",
|
|
1350
|
+
"version": "v1.0.0_{{ config.recommendations_hash or 'unversioned' }}"
|
|
1351
|
+
}
|
|
1352
|
+
)
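# Sketch of an explicit schema (hypothetical feature names), which is what the Field
# and feast.types imports above would support if the placeholder is made concrete:
#   schema=[Field(name="days_since_last", dtype=Int64),
#           Field(name="amount_sum_90d", dtype=Float64)]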
|
|
1353
|
+
''',
|
|
1354
|
+
"landing.py.j2": '''import pandas as pd
|
|
1355
|
+
import numpy as np
|
|
1356
|
+
from pathlib import Path
|
|
1357
|
+
from config import RAW_SOURCES, PRODUCTION_DIR
|
|
1358
|
+
|
|
1359
|
+
SOURCE_NAME = "{{ name }}"
|
|
1360
|
+
ENTITY_COLUMN = "{{ config.entity_column }}"
|
|
1361
|
+
TIME_COLUMN = "{{ config.time_column }}"
|
|
1362
|
+
TARGET_COLUMN = "{{ config.target_column }}"
|
|
1363
|
+
|
|
1364
|
+
|
|
1365
|
+
def load_raw_data() -> pd.DataFrame:
|
|
1366
|
+
source = RAW_SOURCES[SOURCE_NAME]
|
|
1367
|
+
path = Path(source["path"])
|
|
1368
|
+
if path.is_dir() and (path / "_delta_log").is_dir():
|
|
1369
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
1370
|
+
return get_delta(force_local=True).read(str(path))
|
|
1371
|
+
if not path.exists():
|
|
1372
|
+
raise FileNotFoundError(f"Raw source not found: {path}")
|
|
1373
|
+
if source["format"] == "csv":
|
|
1374
|
+
return pd.read_csv(path)
|
|
1375
|
+
return pd.read_parquet(path)
|
|
1376
|
+
|
|
1377
|
+
{% if config.timestamp_coalesce %}
|
|
1378
|
+
|
|
1379
|
+
def coalesce_timestamps(df: pd.DataFrame) -> pd.DataFrame:
|
|
1380
|
+
{% set cols = config.timestamp_coalesce.datetime_columns_ordered %}
|
|
1381
|
+
{% set out = config.timestamp_coalesce.output_column %}
|
|
1382
|
+
df["{{ out }}"] = pd.to_datetime(df["{{ cols[-1] }}"], errors="coerce")
|
|
1383
|
+
{% for col in cols[:-1] | reverse %}
|
|
1384
|
+
df["{{ out }}"] = df["{{ out }}"].fillna(pd.to_datetime(df["{{ col }}"], errors="coerce"))
|
|
1385
|
+
{% endfor %}
|
|
1386
|
+
return df
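# Note on the fillna chain above: the last entry of datetime_columns_ordered is
# assigned first and earlier entries only fill rows that are still NaT, so later
# entries win whenever several timestamp columns are populated.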
|
|
1387
|
+
{% endif %}
|
|
1388
|
+
|
|
1389
|
+
{% if config.label_timestamp %}
|
|
1390
|
+
|
|
1391
|
+
def derive_label_timestamp(df: pd.DataFrame) -> pd.DataFrame:
|
|
1392
|
+
{% set lt = config.label_timestamp %}
|
|
1393
|
+
{% set feature_ts = config.timestamp_coalesce.output_column if config.timestamp_coalesce else config.time_column %}
|
|
1394
|
+
{% if lt.label_column %}
|
|
1395
|
+
df["{{ lt.output_column }}"] = pd.to_datetime(df["{{ lt.label_column }}"], errors="coerce")
|
|
1396
|
+
df["{{ lt.output_column }}"] = df["{{ lt.output_column }}"].fillna(
|
|
1397
|
+
pd.to_datetime(df["{{ feature_ts }}"], errors="coerce") + pd.Timedelta(days={{ lt.fallback_window_days }})
|
|
1398
|
+
)
|
|
1399
|
+
{% else %}
|
|
1400
|
+
df["{{ lt.output_column }}"] = pd.to_datetime(df["{{ feature_ts }}"], errors="coerce") + pd.Timedelta(days={{ lt.fallback_window_days }})
|
|
1401
|
+
{% endif %}
|
|
1402
|
+
return df
|
|
1403
|
+
{% endif %}
|
|
1404
|
+
|
|
1405
|
+
|
|
1406
|
+
def get_landing_output_path() -> Path:
|
|
1407
|
+
return PRODUCTION_DIR / "data" / "landing" / f"{SOURCE_NAME}.parquet"
|
|
1408
|
+
|
|
1409
|
+
|
|
1410
|
+
def run_landing_{{ name }}():
|
|
1411
|
+
print(f"Landing: {SOURCE_NAME}")
|
|
1412
|
+
df = load_raw_data()
|
|
1413
|
+
print(f" Raw records: {len(df):,}")
|
|
1414
|
+
{% if config.raw_time_column %}
|
|
1415
|
+
df = df.rename(columns={"{{ config.raw_time_column }}": TIME_COLUMN})
|
|
1416
|
+
{% endif %}
|
|
1417
|
+
{% if config.original_target_column %}
|
|
1418
|
+
df = df.rename(columns={"{{ config.original_target_column }}": TARGET_COLUMN})
|
|
1419
|
+
{% endif %}
|
|
1420
|
+
{% if config.timestamp_coalesce %}
|
|
1421
|
+
df = coalesce_timestamps(df)
|
|
1422
|
+
{% endif %}
|
|
1423
|
+
{% if config.label_timestamp %}
|
|
1424
|
+
df = derive_label_timestamp(df)
|
|
1425
|
+
{% endif %}
|
|
1426
|
+
output_path = get_landing_output_path()
|
|
1427
|
+
output_dir = output_path.parent
|
|
1428
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
1429
|
+
try:
|
|
1430
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
1431
|
+
storage = get_delta(force_local=True)
|
|
1432
|
+
storage.write(df, str(output_dir / SOURCE_NAME))
|
|
1433
|
+
except ImportError:
|
|
1434
|
+
df.to_parquet(output_path, index=False)
|
|
1435
|
+
print(f" Records: {len(df):,}")
|
|
1436
|
+
print(f" Output: {output_path}")
|
|
1437
|
+
return df
|
|
1438
|
+
|
|
1439
|
+
|
|
1440
|
+
if __name__ == "__main__":
|
|
1441
|
+
run_landing_{{ name }}()
|
|
1442
|
+
''',
|
|
1443
|
+
"bronze_event.py.j2": '''import pandas as pd
|
|
1444
|
+
import numpy as np
|
|
1445
|
+
from pathlib import Path
|
|
1446
|
+
{% set ops, fitted = collect_imports(config.pre_shaping + config.post_shaping, False) %}
|
|
1447
|
+
{% if ops %}
|
|
1448
|
+
from customer_retention.transforms import {{ ops | sort | join(', ') }}
|
|
1449
|
+
{% endif %}
|
|
1450
|
+
from config import PRODUCTION_DIR, RAW_SOURCES, TARGET_COLUMN
|
|
1451
|
+
|
|
1452
|
+
SOURCE_NAME = "{{ source }}"
|
|
1453
|
+
ENTITY_COLUMN = "{{ config.entity_column }}"
|
|
1454
|
+
TIME_COLUMN = "{{ config.time_column }}"
|
|
1455
|
+
|
|
1456
|
+
{% set pre_groups = group_steps(config.pre_shaping) %}
|
|
1457
|
+
|
|
1458
|
+
def apply_pre_shaping(df: pd.DataFrame) -> pd.DataFrame:
|
|
1459
|
+
{% if config.deduplicate %}
|
|
1460
|
+
df = df.drop_duplicates(subset=[ENTITY_COLUMN, TIME_COLUMN], keep="first")
|
|
1461
|
+
{% endif %}
|
|
1462
|
+
{%- if pre_groups %}
|
|
1463
|
+
{%- for func_name, steps in pre_groups %}
|
|
1464
|
+
df = {{ func_name }}(df)
|
|
1465
|
+
{%- endfor %}
|
|
1466
|
+
{%- endif %}
|
|
1467
|
+
return df
|
|
1468
|
+
|
|
1469
|
+
{% for func_name, steps in pre_groups %}
|
|
1470
|
+
|
|
1471
|
+
def {{ func_name }}(df: pd.DataFrame) -> pd.DataFrame:
|
|
1472
|
+
{%- set _prov = provenance_docstring_block(steps) %}
|
|
1473
|
+
{%- if _prov %}
|
|
1474
|
+
{{ _prov }}
|
|
1475
|
+
{%- endif %}
|
|
1476
|
+
{%- for t in steps %}
|
|
1477
|
+
# {{ t.rationale }}
|
|
1478
|
+
# {{ action_description(t) }}
|
|
1479
|
+
df = {{ render_step_call(t) }}
|
|
1480
|
+
{%- endfor %}
|
|
1481
|
+
return df
|
|
1482
|
+
{% endfor %}
|
|
1483
|
+
|
|
1484
|
+
{% if config.aggregation %}
|
|
1485
|
+
def _parse_window(window_str):
|
|
1486
|
+
if window_str == "all_time":
|
|
1487
|
+
return None
|
|
1488
|
+
if window_str.endswith("d"):
|
|
1489
|
+
return pd.Timedelta(days=int(window_str[:-1]))
|
|
1490
|
+
if window_str.endswith("h"):
|
|
1491
|
+
return pd.Timedelta(hours=int(window_str[:-1]))
|
|
1492
|
+
if window_str.endswith("w"):
|
|
1493
|
+
return pd.Timedelta(weeks=int(window_str[:-1]))
|
|
1494
|
+
return pd.Timedelta(days=int(window_str))
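# Examples: _parse_window("30d") -> Timedelta(days=30), "12h" -> Timedelta(hours=12),
# "4w" -> Timedelta(weeks=4), "all_time" -> None (no time filter applied).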
|
|
1495
|
+
|
|
1496
|
+
|
|
1497
|
+
AGGREGATION_WINDOWS = {{ config.aggregation.windows }}
|
|
1498
|
+
VALUE_COLUMNS = {{ config.aggregation.value_columns }}
|
|
1499
|
+
AGG_FUNCS = {{ config.aggregation.agg_funcs }}
|
|
1500
|
+
{% endif %}
|
|
1501
|
+
|
|
1502
|
+
|
|
1503
|
+
def apply_reshaping(df: pd.DataFrame) -> pd.DataFrame:
|
|
1504
|
+
{% if config.aggregation %}
|
|
1505
|
+
df[TIME_COLUMN] = pd.to_datetime(df[TIME_COLUMN])
|
|
1506
|
+
reference_date = df[TIME_COLUMN].max()
|
|
1507
|
+
# Seed an empty frame indexed by entity id; windowed aggregates are attached below
result = df.groupby(ENTITY_COLUMN).agg("first")[[]]
|
|
1508
|
+
if TARGET_COLUMN in df.columns:
|
|
1509
|
+
result[TARGET_COLUMN] = df.groupby(ENTITY_COLUMN)[TARGET_COLUMN].first()
|
|
1510
|
+
for window in AGGREGATION_WINDOWS:
|
|
1511
|
+
td = _parse_window(window)
|
|
1512
|
+
window_df = df if td is None else df[df[TIME_COLUMN] >= (reference_date - td)]
|
|
1513
|
+
for col in VALUE_COLUMNS:
|
|
1514
|
+
for func in AGG_FUNCS:
|
|
1515
|
+
result[f"{col}_{func}_{window}"] = window_df.groupby(ENTITY_COLUMN)[col].agg(func)
|
|
1516
|
+
result[f"event_count_{window}"] = window_df.groupby(ENTITY_COLUMN).size()
|
|
1517
|
+
df = result.reset_index()
|
|
1518
|
+
{% endif %}
|
|
1519
|
+
return df
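# The aggregation above yields one row per entity with columns named
# "<value>_<func>_<window>" (e.g. a hypothetical "amount_sum_30d") plus an
# "event_count_<window>" column for every configured window.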
|
|
1520
|
+
|
|
1521
|
+
{% if config.lifecycle %}
|
|
1522
|
+
|
|
1523
|
+
def _load_raw_events():
|
|
1524
|
+
source = RAW_SOURCES[SOURCE_NAME]
|
|
1525
|
+
path = Path(source["path"])
|
|
1526
|
+
if path.is_dir() and (path / "_delta_log").is_dir():
|
|
1527
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
1528
|
+
return get_delta(force_local=True).read(str(path))
|
|
1529
|
+
if not path.exists():
|
|
1530
|
+
raise FileNotFoundError(f"Raw source not found: {path}")
|
|
1531
|
+
if source["format"] == "csv":
|
|
1532
|
+
return pd.read_csv(path)
|
|
1533
|
+
return pd.read_parquet(path)
|
|
1534
|
+
|
|
1535
|
+
{% if config.lifecycle.include_recency_bucket %}
|
|
1536
|
+
|
|
1537
|
+
def add_recency_tenure(df: pd.DataFrame, raw_df: pd.DataFrame) -> pd.DataFrame:
|
|
1538
|
+
raw_df[TIME_COLUMN] = pd.to_datetime(raw_df[TIME_COLUMN])
|
|
1539
|
+
reference_date = raw_df[TIME_COLUMN].max()
|
|
1540
|
+
entity_stats = raw_df.groupby(ENTITY_COLUMN)[TIME_COLUMN].agg(["min", "max"])
|
|
1541
|
+
entity_stats["days_since_last"] = (reference_date - entity_stats["max"]).dt.days
|
|
1542
|
+
entity_stats["days_since_first"] = (reference_date - entity_stats["min"]).dt.days
|
|
1543
|
+
df = df.merge(entity_stats[["days_since_last", "days_since_first"]], left_on=ENTITY_COLUMN, right_index=True, how="left")
|
|
1544
|
+
return df
|
|
1545
|
+
|
|
1546
|
+
|
|
1547
|
+
def add_recency_buckets(df: pd.DataFrame) -> pd.DataFrame:
|
|
1548
|
+
if "days_since_last" in df.columns:
|
|
1549
|
+
df["recency_bucket"] = pd.cut(df["days_since_last"], bins=[0, 7, 30, 90, 180, 365, float("inf")],
|
|
1550
|
+
labels=["0-7d", "7-30d", "30-90d", "90-180d", "180-365d", "365d+"])
|
|
1551
|
+
return df
|
|
1552
|
+
|
|
1553
|
+
{% endif %}
|
|
1554
|
+
{% if config.lifecycle.include_lifecycle_quadrant %}
|
|
1555
|
+
|
|
1556
|
+
def add_lifecycle_quadrant(df: pd.DataFrame) -> pd.DataFrame:
|
|
1557
|
+
if "days_since_first" not in df.columns:
|
|
1558
|
+
return df
|
|
1559
|
+
tenure = df["days_since_first"]
|
|
1560
|
+
intensity_col = [c for c in df.columns if c.startswith("event_count_")]
|
|
1561
|
+
if not intensity_col:
|
|
1562
|
+
return df
|
|
1563
|
+
intensity = df[intensity_col[0]]
|
|
1564
|
+
tenure_med = tenure.median()
|
|
1565
|
+
intensity_med = intensity.median()
|
|
1566
|
+
conditions = [
|
|
1567
|
+
(tenure >= tenure_med) & (intensity >= intensity_med),
|
|
1568
|
+
(tenure >= tenure_med) & (intensity < intensity_med),
|
|
1569
|
+
(tenure < tenure_med) & (intensity >= intensity_med),
|
|
1570
|
+
(tenure < tenure_med) & (intensity < intensity_med),
|
|
1571
|
+
]
|
|
1572
|
+
labels = ["loyal", "at_risk", "new_active", "new_inactive"]
|
|
1573
|
+
df["lifecycle_quadrant"] = np.select(conditions, labels, default="unknown")
|
|
1574
|
+
return df
|
|
1575
|
+
|
|
1576
|
+
{% endif %}
|
|
1577
|
+
{% if config.lifecycle.include_cyclical_features %}
|
|
1578
|
+
|
|
1579
|
+
def add_cyclical_features(df: pd.DataFrame, raw_df: pd.DataFrame) -> pd.DataFrame:
|
|
1580
|
+
raw_df[TIME_COLUMN] = pd.to_datetime(raw_df[TIME_COLUMN])
|
|
1581
|
+
mean_dow = raw_df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.dayofweek.mean())
|
|
1582
|
+
df = df.merge(mean_dow.rename("mean_dow"), left_on=ENTITY_COLUMN, right_index=True, how="left")
|
|
1583
|
+
df["dow_sin"] = np.sin(2 * np.pi * df["mean_dow"] / 7)
|
|
1584
|
+
df["dow_cos"] = np.cos(2 * np.pi * df["mean_dow"] / 7)
|
|
1585
|
+
df = df.drop(columns=["mean_dow"], errors="ignore")
|
|
1586
|
+
return df
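# The sin/cos pair encodes mean day-of-week cyclically, so Sunday (6) and Monday (0)
# land close together instead of at opposite ends of a 0-6 scale.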
|
|
1587
|
+
|
|
1588
|
+
{% endif %}
|
|
1589
|
+
{% if config.lifecycle.momentum_pairs %}
|
|
1590
|
+
|
|
1591
|
+
def add_momentum_ratios(df: pd.DataFrame) -> pd.DataFrame:
|
|
1592
|
+
{% for pair in config.lifecycle.momentum_pairs %}
|
|
1593
|
+
short_col = "event_count_{{ pair.short_window }}"
|
|
1594
|
+
long_col = "event_count_{{ pair.long_window }}"
|
|
1595
|
+
if short_col in df.columns and long_col in df.columns:
|
|
1596
|
+
df["momentum_{{ pair.short_window }}_{{ pair.long_window }}"] = df[short_col] / df[long_col].replace(0, float("nan"))
|
|
1597
|
+
{% endfor %}
|
|
1598
|
+
return df
|
|
1599
|
+
|
|
1600
|
+
{% endif %}
|
|
1601
|
+
|
|
1602
|
+
def enrich_lifecycle(df: pd.DataFrame) -> pd.DataFrame:
|
|
1603
|
+
raw_df = _load_raw_events()
|
|
1604
|
+
{% if config.raw_time_column %}
|
|
1605
|
+
raw_df = raw_df.rename(columns={"{{ config.raw_time_column }}": TIME_COLUMN})
|
|
1606
|
+
{% endif %}
|
|
1607
|
+
{% if config.lifecycle.include_recency_bucket %}
|
|
1608
|
+
df = add_recency_tenure(df, raw_df)
|
|
1609
|
+
df = add_recency_buckets(df)
|
|
1610
|
+
{% endif %}
|
|
1611
|
+
{% if config.lifecycle.include_lifecycle_quadrant %}
|
|
1612
|
+
df = add_lifecycle_quadrant(df)
|
|
1613
|
+
{% endif %}
|
|
1614
|
+
{% if config.lifecycle.include_cyclical_features %}
|
|
1615
|
+
df = add_cyclical_features(df, raw_df)
|
|
1616
|
+
{% endif %}
|
|
1617
|
+
{% if config.lifecycle.momentum_pairs %}
|
|
1618
|
+
df = add_momentum_ratios(df)
|
|
1619
|
+
{% endif %}
|
|
1620
|
+
return df
|
|
1621
|
+
{% endif %}
|
|
1622
|
+
|
|
1623
|
+
{% set post_groups = group_steps(config.post_shaping) %}
|
|
1624
|
+
|
|
1625
|
+
def apply_post_shaping(df: pd.DataFrame) -> pd.DataFrame:
|
|
1626
|
+
{% if config.lifecycle %}
|
|
1627
|
+
df = enrich_lifecycle(df)
|
|
1628
|
+
{% endif %}
|
|
1629
|
+
{%- if post_groups %}
|
|
1630
|
+
{%- for func_name, steps in post_groups %}
|
|
1631
|
+
df = {{ func_name }}(df)
|
|
1632
|
+
{%- endfor %}
|
|
1633
|
+
{%- endif %}
|
|
1634
|
+
return df
|
|
1635
|
+
|
|
1636
|
+
{% for func_name, steps in post_groups %}
|
|
1637
|
+
|
|
1638
|
+
def {{ func_name }}(df: pd.DataFrame) -> pd.DataFrame:
|
|
1639
|
+
{%- set _prov = provenance_docstring_block(steps) %}
|
|
1640
|
+
{%- if _prov %}
|
|
1641
|
+
{{ _prov }}
|
|
1642
|
+
{%- endif %}
|
|
1643
|
+
{%- for t in steps %}
|
|
1644
|
+
# {{ t.rationale }}
|
|
1645
|
+
# {{ action_description(t) }}
|
|
1646
|
+
df = {{ render_step_call(t) }}
|
|
1647
|
+
{%- endfor %}
|
|
1648
|
+
return df
|
|
1649
|
+
{% endfor %}
|
|
1650
|
+
|
|
1651
|
+
|
|
1652
|
+
def run_bronze_{{ source }}():
|
|
1653
|
+
landing_dir = PRODUCTION_DIR / "data" / "landing" / SOURCE_NAME
|
|
1654
|
+
landing_parquet = PRODUCTION_DIR / "data" / "landing" / f"{SOURCE_NAME}.parquet"
|
|
1655
|
+
if landing_dir.is_dir() and (landing_dir / "_delta_log").is_dir():
|
|
1656
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
1657
|
+
df = get_delta(force_local=True).read(str(landing_dir))
|
|
1658
|
+
elif landing_parquet.exists():
|
|
1659
|
+
df = pd.read_parquet(landing_parquet)
|
|
1660
|
+
else:
|
|
1661
|
+
raise FileNotFoundError(f"Landing output not found: {landing_parquet}")
|
|
1662
|
+
df = apply_pre_shaping(df)
|
|
1663
|
+
df = apply_reshaping(df)
|
|
1664
|
+
df = apply_post_shaping(df)
|
|
1665
|
+
bronze_dir = PRODUCTION_DIR / "data" / "bronze"
|
|
1666
|
+
bronze_dir.mkdir(parents=True, exist_ok=True)
|
|
1667
|
+
try:
|
|
1668
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
1669
|
+
storage = get_delta(force_local=True)
|
|
1670
|
+
storage.write(df, str(bronze_dir / SOURCE_NAME))
|
|
1671
|
+
except ImportError:
|
|
1672
|
+
output_path = bronze_dir / f"{SOURCE_NAME}.parquet"
|
|
1673
|
+
df.to_parquet(output_path, index=False)
|
|
1674
|
+
return df
|
|
1675
|
+
|
|
1676
|
+
|
|
1677
|
+
if __name__ == "__main__":
|
|
1678
|
+
run_bronze_{{ source }}()
|
|
1679
|
+
''',
|
|
1680
|
+
"validate.py.j2": '''import sys
|
|
1681
|
+
from pathlib import Path
|
|
1682
|
+
|
|
1683
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
1684
|
+
|
|
1685
|
+
import pandas as pd
|
|
1686
|
+
import numpy as np
|
|
1687
|
+
from config import SOURCES, EXPLORATION_ARTIFACTS, EXPERIMENTS_DIR, PRODUCTION_DIR, TARGET_COLUMN
|
|
1688
|
+
|
|
1689
|
+
|
|
1690
|
+
def _load_artifact(path):
|
|
1691
|
+
path = Path(path)
|
|
1692
|
+
if path.is_dir() and (path / "_delta_log").is_dir():
|
|
1693
|
+
from customer_retention.integrations.adapters.factory import get_delta
|
|
1694
|
+
return get_delta(force_local=True).read(str(path))
|
|
1695
|
+
return pd.read_parquet(path)
|
|
1696
|
+
|
|
1697
|
+
|
|
1698
|
+
def _compare_dataframes(stage, production_path, exploration_path, entity_key=None, tolerance=1e-5):
|
|
1699
|
+
if not Path(production_path).exists() and not (Path(production_path).is_dir() and (Path(production_path) / "_delta_log").is_dir()):
|
|
1700
|
+
raise FileNotFoundError(f"[{stage}] Production output not found: {production_path}")
|
|
1701
|
+
if not Path(exploration_path).exists() and not (Path(exploration_path).is_dir() and (Path(exploration_path) / "_delta_log").is_dir()):
|
|
1702
|
+
print(f"[{stage}] SKIP - exploration artifact not found: {exploration_path}")
|
|
1703
|
+
return True
|
|
1704
|
+
|
|
1705
|
+
prod = _load_artifact(production_path)
|
|
1706
|
+
expl = _load_artifact(exploration_path)
|
|
1707
|
+
|
|
1708
|
+
if entity_key and entity_key in prod.columns and entity_key in expl.columns:
|
|
1709
|
+
prod = prod.sort_values(entity_key).reset_index(drop=True)
|
|
1710
|
+
expl = expl.sort_values(entity_key).reset_index(drop=True)
|
|
1711
|
+
|
|
1712
|
+
if prod.shape[0] != expl.shape[0]:
|
|
1713
|
+
raise AssertionError(f"[{stage}] Row count: production={prod.shape[0]} vs exploration={expl.shape[0]}")
|
|
1714
|
+
|
|
1715
|
+
prod_cols = set(prod.columns)
|
|
1716
|
+
expl_cols = set(expl.columns)
|
|
1717
|
+
missing = expl_cols - prod_cols
|
|
1718
|
+
extra = prod_cols - expl_cols
|
|
1719
|
+
if missing:
|
|
1720
|
+
print(f"[{stage}] WARNING: missing columns: {missing}")
|
|
1721
|
+
if extra:
|
|
1722
|
+
print(f"[{stage}] INFO: extra columns: {extra}")
|
|
1723
|
+
|
|
1724
|
+
common = sorted(prod_cols & expl_cols)
|
|
1725
|
+
for col in common:
|
|
1726
|
+
if pd.api.types.is_numeric_dtype(prod[col]) and pd.api.types.is_numeric_dtype(expl[col]):
|
|
1727
|
+
try:
|
|
1728
|
+
pd.testing.assert_series_equal(prod[col], expl[col], check_exact=False, rtol=tolerance, check_names=False)
|
|
1729
|
+
except AssertionError as e:
|
|
1730
|
+
delta = (prod[col].astype(float) - expl[col].astype(float)).abs()
|
|
1731
|
+
max_idx = delta.idxmax()
|
|
1732
|
+
raise AssertionError(
|
|
1733
|
+
f"[{stage}] Column '{col}' diverges at row {max_idx}: "
|
|
1734
|
+
f"production={prod[col].iloc[max_idx]} vs exploration={expl[col].iloc[max_idx]} "
|
|
1735
|
+
f"(max delta={delta.max():.2e})"
|
|
1736
|
+
) from None
|
|
1737
|
+
|
|
1738
|
+
print(f"[{stage}] PASS - {prod.shape[0]} rows, {len(common)} common cols, tolerance={tolerance}")
|
|
1739
|
+
return True
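# Usage sketch (hypothetical paths and entity key):
#   _compare_dataframes("Gold", "production/data/gold/features.parquet",
#                       "exploration/gold_features.parquet", entity_key="customer_id")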
|
|
1740
|
+
|
|
1741
|
+
|
|
1742
|
+
def validate_landing(tolerance=1e-5):
|
|
1743
|
+
landing_dir = PRODUCTION_DIR / "data" / "landing"
|
|
1744
|
+
if not landing_dir.exists():
|
|
1745
|
+
print("[Landing] SKIP - no landing directory")
|
|
1746
|
+
return True
|
|
1747
|
+
for path in landing_dir.glob("*.parquet"):
|
|
1748
|
+
name = path.stem
|
|
1749
|
+
expl_key = f"landing_{name}" if f"landing_{name}" in EXPLORATION_ARTIFACTS else "landing"
|
|
1750
|
+
if expl_key in EXPLORATION_ARTIFACTS:
|
|
1751
|
+
_compare_dataframes(f"Landing/{name}", str(path), EXPLORATION_ARTIFACTS[expl_key])
|
|
1752
|
+
return True
|
|
1753
|
+
|
|
1754
|
+
|
|
1755
|
+
def validate_bronze(tolerance=1e-5):
|
|
1756
|
+
bronze_artifacts = EXPLORATION_ARTIFACTS.get("bronze", {})
|
|
1757
|
+
for name, expl_path in bronze_artifacts.items():
|
|
1758
|
+
prod_path = PRODUCTION_DIR / "data" / "bronze" / f"{name}.parquet"
|
|
1759
|
+
_compare_dataframes(f"Bronze/{name}", str(prod_path), expl_path, tolerance=tolerance)
|
|
1760
|
+
return True
|
|
1761
|
+
|
|
1762
|
+
|
|
1763
|
+
def validate_silver(tolerance=1e-5):
|
|
1764
|
+
prod_path = PRODUCTION_DIR / "data" / "silver" / "merged.parquet"
|
|
1765
|
+
expl_path = EXPLORATION_ARTIFACTS.get("silver", "")
|
|
1766
|
+
entity_key = list(SOURCES.values())[0]["entity_key"] if SOURCES else None
|
|
1767
|
+
_compare_dataframes("Silver", str(prod_path), expl_path, entity_key=entity_key, tolerance=tolerance)
|
|
1768
|
+
return True
|
|
1769
|
+
|
|
1770
|
+
|
|
1771
|
+
def validate_gold(tolerance=1e-5):
|
|
1772
|
+
prod_path = PRODUCTION_DIR / "data" / "gold" / "features.parquet"
|
|
1773
|
+
expl_path = EXPLORATION_ARTIFACTS.get("gold", "")
|
|
1774
|
+
entity_key = list(SOURCES.values())[0]["entity_key"] if SOURCES else None
|
|
1775
|
+
_compare_dataframes("Gold", str(prod_path), expl_path, entity_key=entity_key, tolerance=tolerance)
|
|
1776
|
+
return True
|
|
1777
|
+
|
|
1778
|
+
|
|
1779
|
+
def validate_training():
|
|
1780
|
+
print("[Training] PASS - training validation requires MLflow comparison (not yet implemented)")
|
|
1781
|
+
return True
|
|
1782
|
+
|
|
1783
|
+
|
|
1784
|
+
def validate_scoring(tolerance=1e-5):
|
|
1785
|
+
prod_path = PRODUCTION_DIR / "data" / "scoring" / "predictions.parquet"
|
|
1786
|
+
expl_path = EXPLORATION_ARTIFACTS.get("scoring", "")
|
|
1787
|
+
_compare_dataframes("Scoring", str(prod_path), expl_path, tolerance=tolerance)
|
|
1788
|
+
return True
|
|
1789
|
+
|
|
1790
|
+
|
|
1791
|
+
def run_all_validations(tolerance=1e-5):
|
|
1792
|
+
stages = [
|
|
1793
|
+
("Landing", lambda: validate_landing(tolerance)),
|
|
1794
|
+
("Bronze", lambda: validate_bronze(tolerance)),
|
|
1795
|
+
("Silver", lambda: validate_silver(tolerance)),
|
|
1796
|
+
("Gold", lambda: validate_gold(tolerance)),
|
|
1797
|
+
("Training", validate_training),
|
|
1798
|
+
("Scoring", lambda: validate_scoring(tolerance)),
|
|
1799
|
+
]
|
|
1800
|
+
results = []
|
|
1801
|
+
for name, fn in stages:
|
|
1802
|
+
try:
|
|
1803
|
+
fn()
|
|
1804
|
+
results.append((name, "PASS"))
|
|
1805
|
+
except Exception as e:
|
|
1806
|
+
results.append((name, f"FAIL: {e}"))
|
|
1807
|
+
break
|
|
1808
|
+
|
|
1809
|
+
print("\\nStage Validation Report")
|
|
1810
|
+
print("=" * 50)
|
|
1811
|
+
for name, status in results:
|
|
1812
|
+
print(f"[{status.split(':')[0]:4s}] {name}")
|
|
1813
|
+
return results
|
|
1814
|
+
''',
|
|
1815
|
+
"run_validation.py.j2": '''"""{{ config.name }} - Standalone Validation Runner
|
|
1816
|
+
|
|
1817
|
+
Compares pipeline outputs against exploration artifacts.
|
|
1818
|
+
Run after pipeline completes to verify correctness.
|
|
1819
|
+
"""
|
|
1820
|
+
import sys
|
|
1821
|
+
from pathlib import Path
|
|
1822
|
+
|
|
1823
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
1824
|
+
|
|
1825
|
+
from validation.validate_pipeline import run_all_validations
|
|
1826
|
+
|
|
1827
|
+
|
|
1828
|
+
if __name__ == "__main__":
|
|
1829
|
+
import argparse
|
|
1830
|
+
parser = argparse.ArgumentParser(description="Validate pipeline outputs")
|
|
1831
|
+
parser.add_argument("--tolerance", type=float, default=1e-5)
|
|
1832
|
+
args = parser.parse_args()
|
|
1833
|
+
|
|
1834
|
+
results = run_all_validations(tolerance=args.tolerance)
|
|
1835
|
+
failures = [r for r in results if not r[1].startswith("PASS")]
|
|
1836
|
+
sys.exit(1 if failures else 0)
|
|
1837
|
+
''',
|
|
1838
|
+
"exploration_report.py.j2": '''"""Exploration Report Viewer
|
|
1839
|
+
|
|
1840
|
+
Opens HTML documentation for the exploration notebooks that informed
|
|
1841
|
+
the pipeline transformations. Works both locally (file:// URI) and
|
|
1842
|
+
on Databricks (displayHTML with scroll-to-anchor injection).
|
|
1843
|
+
"""
|
|
1844
|
+
import os
|
|
1845
|
+
import webbrowser
|
|
1846
|
+
from pathlib import Path
|
|
1847
|
+
|
|
1848
|
+
# Known notebooks referenced by pipeline provenance comments
|
|
1849
|
+
KNOWN_NOTEBOOKS = [
|
|
1850
|
+
{% for nb in notebooks %}
|
|
1851
|
+
"{{ nb }}",
|
|
1852
|
+
{% endfor %}
|
|
1853
|
+
]
|
|
1854
|
+
|
|
1855
|
+
DOCS_DIR = Path(os.environ.get("CR_DOCS_BASE_URL", str(Path(__file__).parent)))
|
|
1856
|
+
|
|
1857
|
+
|
|
1858
|
+
def _is_databricks():
|
|
1859
|
+
return "DATABRICKS_RUNTIME_VERSION" in os.environ
|
|
1860
|
+
|
|
1861
|
+
|
|
1862
|
+
def list_reports():
|
|
1863
|
+
for nb in KNOWN_NOTEBOOKS:
|
|
1864
|
+
html_path = DOCS_DIR / f"{nb}.html"
|
|
1865
|
+
status = "available" if html_path.exists() else "missing"
|
|
1866
|
+
print(f" {nb}: {status}")
|
|
1867
|
+
|
|
1868
|
+
|
|
1869
|
+
if __name__ == "__main__":
|
|
1870
|
+
print("Available exploration reports:")
|
|
1871
|
+
list_reports()
|
|
1872
|
+
''',
|
|
1873
|
+
}
|
|
1874
|
+
|
|
1875
|
+
|
|
1876
|
+
class CodeRenderer:
|
|
1877
|
+
_TEMPLATE_MAP = {
|
|
1878
|
+
"config": "config.py.j2",
|
|
1879
|
+
"silver": "silver.py.j2",
|
|
1880
|
+
"gold": "gold.py.j2",
|
|
1881
|
+
"training": "training.py.j2",
|
|
1882
|
+
"runner": "runner.py.j2",
|
|
1883
|
+
"workflow": "workflow.json.j2",
|
|
1884
|
+
"run_all": "run_all.py.j2",
|
|
1885
|
+
"feast_config": "feature_store.yaml.j2",
|
|
1886
|
+
"feast_features": "features.py.j2",
|
|
1887
|
+
|
|
1888
|
+
"landing": "landing.py.j2",
|
|
1889
|
+
"bronze_event": "bronze_event.py.j2",
|
|
1890
|
+
"validation": "validate.py.j2",
|
|
1891
|
+
"run_validation": "run_validation.py.j2",
|
|
1892
|
+
"exploration_report": "exploration_report.py.j2",
|
|
1893
|
+
}
|
|
1894
|
+
|
|
1895
|
+
def __init__(self):
|
|
1896
|
+
self._env = Environment(loader=InlineLoader(TEMPLATES))
|
|
1897
|
+
self._env.globals["action_description"] = action_description
|
|
1898
|
+
self._env.globals["render_step_call"] = render_step_call
|
|
1899
|
+
self._env.globals["collect_imports"] = collect_imports
|
|
1900
|
+
self._env.globals["group_steps"] = group_steps
|
|
1901
|
+
self._env.globals["provenance_docstring_block"] = provenance_docstring_block
|
|
1902
|
+
self._env.globals["provenance_key"] = provenance_key
|
|
1903
|
+
|
|
1904
|
+
def set_docs_base(self, experiments_dir: str | None) -> None:
|
|
1905
|
+
global _docs_base
|
|
1906
|
+
if experiments_dir:
|
|
1907
|
+
_docs_base = f"file://{Path(experiments_dir).resolve() / 'docs'}"
|
|
1908
|
+
else:
|
|
1909
|
+
_docs_base = "docs"
|
|
1910
|
+
|
|
1911
|
+
def _render(self, template_key: str, **context) -> str:
|
|
1912
|
+
return self._env.get_template(self._TEMPLATE_MAP[template_key]).render(**context)
|
|
1913
|
+
|
|
1914
|
+
def render_config(self, config: PipelineConfig) -> str:
|
|
1915
|
+
return self._render("config", config=config)
|
|
1916
|
+
|
|
1917
|
+
def render_bronze(self, source_name: str, bronze_config: BronzeLayerConfig) -> str:
|
|
1918
|
+
return self._env.get_template("bronze.py.j2").render(source=source_name, config=bronze_config)
|
|
1919
|
+
|
|
1920
|
+
def render_silver(self, config: PipelineConfig) -> str:
|
|
1921
|
+
return self._render("silver", config=config)
|
|
1922
|
+
|
|
1923
|
+
def render_gold(self, config: PipelineConfig) -> str:
|
|
1924
|
+
return self._render("gold", config=config)
|
|
1925
|
+
|
|
1926
|
+
def render_training(self, config: PipelineConfig) -> str:
|
|
1927
|
+
return self._render("training", config=config)
|
|
1928
|
+
|
|
1929
|
+
def render_runner(self, config: PipelineConfig) -> str:
|
|
1930
|
+
return self._render("runner", config=config)
|
|
1931
|
+
|
|
1932
|
+
def render_workflow(self, config: PipelineConfig) -> str:
|
|
1933
|
+
return self._render("workflow", config=config)
|
|
1934
|
+
|
|
1935
|
+
def render_run_all(self, config: PipelineConfig) -> str:
|
|
1936
|
+
return self._render("run_all", config=config)
|
|
1937
|
+
|
|
1938
|
+
def render_feast_config(self, config: PipelineConfig) -> str:
|
|
1939
|
+
return self._render("feast_config", config=config)
|
|
1940
|
+
|
|
1941
|
+
def render_feast_features(self, config: PipelineConfig) -> str:
|
|
1942
|
+
return self._render("feast_features", config=config)
|
|
1943
|
+
|
|
1944
|
+
|
|
1945
|
+
def render_landing(self, name: str, config: LandingLayerConfig) -> str:
|
|
1946
|
+
return self._env.get_template("landing.py.j2").render(name=name, config=config)
|
|
1947
|
+
|
|
1948
|
+
def render_bronze_event(self, source_name: str, config: BronzeEventConfig) -> str:
|
|
1949
|
+
return self._env.get_template("bronze_event.py.j2").render(source=source_name, config=config)
|
|
1950
|
+
|
|
1951
|
+
def render_validation(self, config: PipelineConfig) -> str:
|
|
1952
|
+
return self._render("validation", config=config)
|
|
1953
|
+
|
|
1954
|
+
def render_run_validation(self, config: PipelineConfig) -> str:
|
|
1955
|
+
return self._render("run_validation", config=config)
|
|
1956
|
+
|
|
1957
|
+
def render_exploration_report(self, config: PipelineConfig) -> str:
|
|
1958
|
+
notebooks = set()
|
|
1959
|
+
for bronze in config.bronze.values():
|
|
1960
|
+
for step in bronze.transformations:
|
|
1961
|
+
nb = step.source_notebook or DEFAULT_NOTEBOOK_MAP.get(step.type)
|
|
1962
|
+
if nb:
|
|
1963
|
+
notebooks.add(nb)
|
|
1964
|
+
for step in config.gold.transformations + config.gold.encodings + config.gold.scalings:
|
|
1965
|
+
nb = step.source_notebook or DEFAULT_NOTEBOOK_MAP.get(step.type)
|
|
1966
|
+
if nb:
|
|
1967
|
+
notebooks.add(nb)
|
|
1968
|
+
for step in config.silver.derived_columns:
|
|
1969
|
+
nb = step.source_notebook or DEFAULT_NOTEBOOK_MAP.get(step.type)
|
|
1970
|
+
if nb:
|
|
1971
|
+
notebooks.add(nb)
|
|
1972
|
+
for be in config.bronze_event.values():
|
|
1973
|
+
for step in be.pre_shaping + be.post_shaping:
|
|
1974
|
+
nb = step.source_notebook or DEFAULT_NOTEBOOK_MAP.get(step.type)
|
|
1975
|
+
if nb:
|
|
1976
|
+
notebooks.add(nb)
|
|
1977
|
+
return self._render("exploration_report", notebooks=sorted(notebooks))
|
|
1978
|
+
|
|
1979
|
+
|
|
1980
|
+
+_StepMeta = namedtuple("_StepMeta", ["desc_tpl", "call_tpl", "import_name", "param_defaults"])
+
+_STATELESS_REGISTRY = {
+    PipelineTransformationType.IMPUTE_NULL: _StepMeta(
+        "impute nulls in {col} with {value}",
+        "apply_impute_null(df, '{col}', value='{value}')",
+        "apply_impute_null", {"value": 0}),
+    PipelineTransformationType.CAP_OUTLIER: _StepMeta(
+        "cap outliers in {col} to [{lower}, {upper}]",
+        "apply_cap_outlier(df, '{col}', lower={lower}, upper={upper})",
+        "apply_cap_outlier", {"lower": 0, "upper": 1000000}),
+    PipelineTransformationType.TYPE_CAST: _StepMeta(
+        "cast {col} to {dtype}",
+        "apply_type_cast(df, '{col}', dtype='{dtype}')",
+        "apply_type_cast", {"dtype": "float"}),
+    PipelineTransformationType.DROP_COLUMN: _StepMeta(
+        "drop column {col}",
+        "apply_drop_column(df, '{col}')",
+        "apply_drop_column", {}),
+    PipelineTransformationType.WINSORIZE: _StepMeta(
+        "winsorize {col} to [{lower_bound}, {upper_bound}]",
+        "apply_winsorize(df, '{col}', lower_bound={lower_bound}, upper_bound={upper_bound})",
+        "apply_winsorize", {"lower_bound": 0, "upper_bound": 1000000}),
+    PipelineTransformationType.SEGMENT_AWARE_CAP: _StepMeta(
+        "segment-aware outlier cap on {col} ({n_segments} segments)",
+        "apply_segment_aware_cap(df, '{col}', n_segments={n_segments})",
+        "apply_segment_aware_cap", {"n_segments": 2}),
+    PipelineTransformationType.LOG_TRANSFORM: _StepMeta(
+        "log-transform {col}",
+        "apply_log_transform(df, '{col}')",
+        "apply_log_transform", {}),
+    PipelineTransformationType.SQRT_TRANSFORM: _StepMeta(
+        "sqrt-transform {col}",
+        "apply_sqrt_transform(df, '{col}')",
+        "apply_sqrt_transform", {}),
+    PipelineTransformationType.ZERO_INFLATION_HANDLING: _StepMeta(
+        "handle zero-inflation in {col}",
+        "apply_zero_inflation_handling(df, '{col}')",
+        "apply_zero_inflation_handling", {}),
+    PipelineTransformationType.CAP_THEN_LOG: _StepMeta(
+        "cap at p99 then log-transform {col}",
+        "apply_cap_then_log(df, '{col}')",
+        "apply_cap_then_log", {}),
+    PipelineTransformationType.FEATURE_SELECT: _StepMeta(
+        "drop {col} (feature selection)",
+        "apply_feature_select(df, '{col}')",
+        "apply_feature_select", {}),
+}
+
+
+def _extract_params(step, meta):
+    return {k: step.parameters.get(k, v) for k, v in meta.param_defaults.items()}
+
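To illustrate how a registry entry is consumed, the sketch below builds a stand-in step and formats the two templates the way action_description and render_step_call (defined below) do. The real TransformationStep constructor is not shown in this diff, so types.SimpleNamespace fills in for an object with .type, .column, and .parameters; the column name and bounds are hypothetical.

from types import SimpleNamespace

# Stand-in for a TransformationStep; only .type, .column, and .parameters are used here.
step = SimpleNamespace(
    type=PipelineTransformationType.CAP_OUTLIER,
    column="monthly_charges",
    parameters={"lower": 0, "upper": 500},
)
meta = _STATELESS_REGISTRY[step.type]
params = _extract_params(step, meta)            # {"lower": 0, "upper": 500}
meta.desc_tpl.format(col=step.column, **params)
# -> "cap outliers in monthly_charges to [0, 500]"
meta.call_tpl.format(col=step.column, **params)
# -> "apply_cap_outlier(df, 'monthly_charges', lower=0, upper=500)"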
+def action_description(step: TransformationStep) -> str:
+    t, col, p = step.type, step.column, step.parameters
+    meta = _STATELESS_REGISTRY.get(t)
+    if meta is not None:
+        return meta.desc_tpl.format(col=col, **_extract_params(step, meta))
+    if t == PipelineTransformationType.YEO_JOHNSON:
+        return f"yeo-johnson transform {col}"
+    if t == PipelineTransformationType.ENCODE:
+        method = p.get("method", "one_hot")
+        if method in ("one_hot", "onehot"):
+            return f"one-hot encode {col}"
+        return f"label-encode {col}"
+    if t == PipelineTransformationType.SCALE:
+        method = p.get("method", "standard")
+        if method == "minmax":
+            return f"min-max scale {col}"
+        return f"standard-scale {col}"
+    if t == PipelineTransformationType.DERIVED_COLUMN:
+        action = p.get("action", "ratio")
+        if action == "ratio":
+            return f"create {col} = {p.get('numerator', '?')} / {p.get('denominator', '?')}"
+        if action == "interaction":
+            features = p.get("features", [])
+            col_a = features[0] if len(features) > 0 else p.get("col_a", "?")
+            col_b = features[1] if len(features) > 1 else p.get("col_b", "?")
+            return f"create {col} = {col_a} * {col_b}"
+        if action == "composite":
+            return f"create {col} = mean({', '.join(p.get('columns', []))})"
+    return f"transform {col}"
+
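Step types not covered by the stateless registry fall through to the explicit branches above. For example, again using a SimpleNamespace stand-in and hypothetical column names:

from types import SimpleNamespace

encode_step = SimpleNamespace(
    type=PipelineTransformationType.ENCODE,
    column="plan_tier",
    parameters={"method": "label"},
)
action_description(encode_step)
# -> "label-encode plan_tier"

ratio_step = SimpleNamespace(
    type=PipelineTransformationType.DERIVED_COLUMN,
    column="charges_per_month",
    parameters={"action": "ratio", "numerator": "total_charges", "denominator": "tenure"},
)
action_description(ratio_step)
# -> "create charges_per_month = total_charges / tenure"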
+def render_step_call(step: TransformationStep, fit_mode: bool = True) -> str:
+    t, col, p = step.type, step.column, step.parameters
+    meta = _STATELESS_REGISTRY.get(t)
+    if meta is not None:
+        return meta.call_tpl.format(col=col, **_extract_params(step, meta))
+    if t == PipelineTransformationType.YEO_JOHNSON:
+        method = "fit_transform" if fit_mode else "transform"
+        return f"FittedPowerTransform().{method}(df, '{col}', _store)"
+    if t == PipelineTransformationType.ENCODE:
+        method = p.get("method", "one_hot")
+        if method in ("one_hot", "onehot"):
+            return f"apply_one_hot_encode(df, '{col}')"
+        fit_method = "fit_transform" if fit_mode else "transform"
+        return f"FittedEncoder().{fit_method}(df, '{col}', _store)"
+    if t == PipelineTransformationType.SCALE:
+        method = p.get("method", "standard")
+        fit_method = "fit_transform" if fit_mode else "transform"
+        return f"FittedScaler('{method}').{fit_method}(df, '{col}', _store)"
+    if t == PipelineTransformationType.DERIVED_COLUMN:
+        action = p.get("action", "ratio")
+        if action == "ratio":
+            return f"apply_derived_ratio(df, '{col}', numerator='{p.get('numerator', '')}', denominator='{p.get('denominator', '')}')"
+        if action == "interaction":
+            features = p.get("features", [])
+            col_a = features[0] if len(features) > 0 else p.get("col_a", "")
+            col_b = features[1] if len(features) > 1 else p.get("col_b", "")
+            return f"apply_derived_interaction(df, '{col}', col_a='{col_a}', col_b='{col_b}')"
+        if action == "composite":
+            return f"apply_derived_composite(df, '{col}', columns={p.get('columns', [])})"
+    raise ValueError(f"Unknown transformation type: {step.type}")
+
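The fit_mode flag only changes the output for the stateful transforms (scaling, label encoding, Yeo-Johnson): the emitted call uses fit_transform when fitting and transform otherwise, against what appears to be a shared parameter store (_store) in the generated code. A short illustration with a stand-in step and a hypothetical column name:

from types import SimpleNamespace

scale_step = SimpleNamespace(
    type=PipelineTransformationType.SCALE,
    column="tenure_months",
    parameters={"method": "minmax"},
)
render_step_call(scale_step, fit_mode=True)
# -> "FittedScaler('minmax').fit_transform(df, 'tenure_months', _store)"
render_step_call(scale_step, fit_mode=False)
# -> "FittedScaler('minmax').transform(df, 'tenure_months', _store)"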
+def collect_imports(steps, include_fitted):
+    ops = set()
+    fitted = set()
+    _OPS_MAP = {k: v.import_name for k, v in _STATELESS_REGISTRY.items()}
+    for step in steps:
+        t, p = step.type, step.parameters
+        if t in _OPS_MAP:
+            ops.add(_OPS_MAP[t])
+        elif t == PipelineTransformationType.ENCODE:
+            method = p.get("method", "one_hot")
+            if method in ("one_hot", "onehot"):
+                ops.add("apply_one_hot_encode")
+            elif include_fitted:
+                fitted.add("FittedEncoder")
+        elif t == PipelineTransformationType.SCALE:
+            if include_fitted:
+                fitted.add("FittedScaler")
+        elif t == PipelineTransformationType.YEO_JOHNSON:
+            if include_fitted:
+                fitted.add("FittedPowerTransform")
+        elif t == PipelineTransformationType.DERIVED_COLUMN:
+            action = p.get("action", "ratio")
+            if action == "ratio":
+                ops.add("apply_derived_ratio")
+            elif action == "interaction":
+                ops.add("apply_derived_interaction")
+            elif action == "composite":
+                ops.add("apply_derived_composite")
+    return ops, fitted
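collect_imports returns two disjoint sets: names of the stateless apply_* helpers, plus (when include_fitted is true) the Fitted* classes required by stateful steps; the templates presumably join these into import lines. A small worked example, with stand-in steps as in the earlier sketches:

from types import SimpleNamespace

steps = [
    SimpleNamespace(type=PipelineTransformationType.LOG_TRANSFORM, parameters={}),
    SimpleNamespace(type=PipelineTransformationType.SCALE, parameters={"method": "standard"}),
    SimpleNamespace(type=PipelineTransformationType.ENCODE, parameters={"method": "one_hot"}),
]
ops, fitted = collect_imports(steps, include_fitted=True)
# ops    == {"apply_log_transform", "apply_one_hot_encode"}
# fitted == {"FittedScaler"}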