churnkit-0.75.0a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
customer_retention/generators/pipeline_generator/findings_parser.py
@@ -0,0 +1,727 @@
+from pathlib import Path
+from typing import Dict, List, Optional, Set, Tuple
+
+import yaml
+
+from customer_retention.analysis.auto_explorer.exploration_manager import (
+    DatasetInfo,
+    DatasetRelationshipInfo,
+    MultiDatasetFindings,
+)
+from customer_retention.analysis.auto_explorer.findings import ExplorationFindings
+from customer_retention.analysis.auto_explorer.layered_recommendations import RecommendationRegistry
+
+from .models import (
+    AggregationWindowConfig,
+    BronzeEventConfig,
+    BronzeLayerConfig,
+    GoldLayerConfig,
+    LabelTimestampConfig,
+    LandingLayerConfig,
+    LifecycleConfig,
+    PipelineConfig,
+    PipelineTransformationType,
+    SilverLayerConfig,
+    SourceConfig,
+    TimestampCoalesceConfig,
+    TransformationStep,
+)
+
+
+def _resolve_col_type(col_finding) -> str:
+    col_type = col_finding.inferred_type
+    if hasattr(col_type, 'value'):
+        col_type = col_type.value
+    return col_type
+
+
+class FindingsParser:
+    def __init__(self, findings_dir: str):
+        self._findings_dir = Path(findings_dir)
+        self._source_findings_paths: Dict[str, Path] = {}
+
+    def parse(self) -> PipelineConfig:
+        multi_dataset = self._load_multi_dataset_findings()
+        selected_sources = list(multi_dataset.datasets.keys())
+        source_findings = self._load_source_findings(selected_sources, self._findings_dir, multi_dataset)
+        discovered_events = self._discover_event_sources(source_findings)
+        recommendations_registry = self._load_recommendations()
+        recommendations_hash = recommendations_registry.compute_recommendations_hash() if recommendations_registry else None
+        config = self._build_pipeline_config(multi_dataset, source_findings, recommendations_hash)
+        if recommendations_registry:
+            self._apply_recommendations_to_config(config, recommendations_registry, multi_dataset)
+        self._build_landing_configs(config, multi_dataset, source_findings)
+        self._build_discovered_landing_configs(config, discovered_events, multi_dataset)
+        self._build_bronze_event_configs(config, multi_dataset, source_findings, discovered_events)
+        self._reconcile_discovered_event_transforms(config, discovered_events)
+        return config
+
+    def _load_recommendations(self) -> Optional[RecommendationRegistry]:
+        recommendations_path = None
+        pattern_matches = list(self._findings_dir.glob("*_recommendations.yaml"))
+        if pattern_matches:
+            recommendations_path = max(pattern_matches, key=lambda p: p.stat().st_mtime)
+        elif (self._findings_dir / "recommendations.yaml").exists():
+            recommendations_path = self._findings_dir / "recommendations.yaml"
+        if recommendations_path and recommendations_path.exists():
+            with open(recommendations_path) as f:
+                return RecommendationRegistry.from_dict(yaml.safe_load(f))
+        return None
+
+    def _load_multi_dataset_findings(self) -> MultiDatasetFindings:
+        path = self._findings_dir / "multi_dataset_findings.yaml"
+        if not path.exists():
+            return self._synthesize_from_single_source()
+        with open(path) as f:
+            data = yaml.safe_load(f)
+        return self._dict_to_multi_dataset_findings(data)
+
+    def _synthesize_from_single_source(self) -> MultiDatasetFindings:
+        from customer_retention.core.config.column_config import DatasetGranularity
+
+        candidates = [
+            p for p in self._findings_dir.glob("*_findings.yaml")
+            if p.name != "multi_dataset_findings.yaml"
+        ]
+        if not candidates:
+            raise FileNotFoundError(
+                f"No findings files found in {self._findings_dir}"
+            )
+
+        datasets = {}
+        first_name = None
+        for path in candidates:
+            findings = ExplorationFindings.load(str(path))
+            name = path.stem.replace("_findings", "")
+            if first_name is None:
+                first_name = name
+            datasets[name] = DatasetInfo(
+                name=name,
+                findings_path=str(path),
+                source_path=findings.source_path,
+                granularity=DatasetGranularity.ENTITY_LEVEL,
+                row_count=findings.row_count,
+                column_count=findings.column_count,
+                entity_column=(
+                    findings.identifier_columns[0]
+                    if findings.identifier_columns
+                    else None
+                ),
+                target_column=findings.target_column,
+            )
+
+        return MultiDatasetFindings(
+            datasets=datasets,
+            primary_entity_dataset=first_name,
+        )
+
+    def _dict_to_multi_dataset_findings(self, data: Dict) -> MultiDatasetFindings:
+        from customer_retention.core.config.column_config import DatasetGranularity
+        datasets = {}
+        for name, info in data.get("datasets", {}).items():
+            granularity_str = info.get("granularity", "unknown")
+            granularity = DatasetGranularity(granularity_str) if granularity_str else DatasetGranularity.UNKNOWN
+            datasets[name] = DatasetInfo(
+                name=info["name"],
+                findings_path=info.get("findings_path", ""),
+                source_path=info.get("source_path", ""),
+                granularity=granularity,
+                row_count=info.get("row_count", 0),
+                column_count=info.get("column_count", 0),
+                entity_column=info.get("entity_column"),
+                time_column=info.get("time_column"),
+                target_column=info.get("target_column"),
+                excluded=info.get("excluded", False)
+            )
+        relationships = [
+            DatasetRelationshipInfo(
+                left_dataset=r["left_dataset"],
+                right_dataset=r["right_dataset"],
+                left_column=r["left_column"],
+                right_column=r["right_column"],
+                relationship_type=r.get("relationship_type", "one_to_many"),
+                confidence=r.get("confidence", 1.0),
+                auto_detected=r.get("auto_detected", False)
+            )
+            for r in data.get("relationships", [])
+        ]
+        return MultiDatasetFindings(
+            datasets=datasets,
+            relationships=relationships,
+            primary_entity_dataset=data.get("primary_entity_dataset"),
+            event_datasets=data.get("event_datasets", []),
+            excluded_datasets=data.get("excluded_datasets", []),
+            aggregation_windows=data.get("aggregation_windows", ["24h", "7d", "30d", "90d", "180d", "365d", "all_time"]),
+            notes=data.get("notes", {}),
+        )
+
+    def _load_source_findings(self, sources: List[str], findings_dir: Path, multi_dataset: MultiDatasetFindings = None) -> Dict[str, ExplorationFindings]:
+        result = {}
+        for name in sources:
+            path = None
+            if multi_dataset and name in multi_dataset.datasets:
+                dataset_info = multi_dataset.datasets[name]
+                if dataset_info.findings_path:
+                    raw_path = Path(dataset_info.findings_path)
+                    if raw_path.is_absolute():
+                        path = raw_path
+                    else:
+                        path = (findings_dir / raw_path).resolve()
+                        if not path.exists():
+                            path = findings_dir / raw_path.name
+            if path is None or not path.exists():
+                candidates = list(findings_dir.glob(f"{name}_*_findings.yaml"))
+                if candidates:
+                    path = candidates[0]
+                else:
+                    path = findings_dir / f"{name}_findings.yaml"
+            if path.exists():
+                result[name] = ExplorationFindings.load(str(path))
+                self._source_findings_paths[name] = path.resolve()
+        return result
+
+    def _build_pipeline_config(self, multi: MultiDatasetFindings, sources: Dict[str, ExplorationFindings], recommendations_hash: Optional[str] = None) -> PipelineConfig:
+        source_configs = self._build_source_configs(multi, sources)
+        bronze_configs = self._build_bronze_configs(sources, source_configs)
+        return PipelineConfig(
+            name="",
+            target_column=self._find_target_column(sources),
+            sources=source_configs,
+            bronze=bronze_configs,
+            silver=self._build_silver_config(multi, sources),
+            gold=self._build_gold_config(sources),
+            output_dir="",
+            recommendations_hash=recommendations_hash,
+        )
+
+    def _build_source_configs(self, multi: MultiDatasetFindings, sources: Dict[str, ExplorationFindings]) -> List[SourceConfig]:
+        result = []
+        for name, findings in sources.items():
+            dataset_info = multi.datasets.get(name)
+            is_event = name in multi.event_datasets
+            is_excluded = name in multi.excluded_datasets or (dataset_info and dataset_info.excluded)
+            raw_source = str(Path(dataset_info.source_path if dataset_info else findings.source_path).resolve())
+            time_col = None
+            entity_key = findings.identifier_columns[0] if findings.identifier_columns else "id"
+            if is_event and findings.time_series_metadata:
+                time_col = findings.time_series_metadata.time_column
+                if findings.time_series_metadata.entity_column:
+                    entity_key = findings.time_series_metadata.entity_column
+            result.append(SourceConfig(
+                name=name,
+                path=Path(findings.source_path).name,
+                format=findings.source_format,
+                entity_key=entity_key,
+                raw_source_path=raw_source,
+                time_column=time_col,
+                is_event_level=is_event,
+                excluded=is_excluded
+            ))
+        return result
+
+    def _build_bronze_configs(self, sources: Dict[str, ExplorationFindings], source_configs: List[SourceConfig]) -> Dict[str, BronzeLayerConfig]:
+        result = {}
+        source_map = {s.name: s for s in source_configs}
+        for name, findings in sources.items():
+            source_cfg = source_map[name]
+            if source_cfg.is_event_level:
+                continue
+            result[name] = BronzeLayerConfig(source=source_cfg, transformations=self._extract_transformations(findings))
+        return result
+
+    def _extract_transformations(self, findings: ExplorationFindings) -> List[TransformationStep]:
+        transformations = []
+        for col_name, col_finding in findings.columns.items():
+            if not col_finding.cleaning_needed:
+                continue
+            for rec in col_finding.cleaning_recommendations:
+                step = self._parse_cleaning_recommendation(col_name, rec)
+                if step:
+                    transformations.append(step)
+        return transformations
+
+    def _parse_cleaning_recommendation(self, column: str, recommendation: str) -> TransformationStep:
+        if ":" in recommendation:
+            action, param = recommendation.split(":", 1)
+        else:
+            action, param = recommendation, ""
+        if action == "impute_null":
+            return TransformationStep(
+                type=PipelineTransformationType.IMPUTE_NULL,
+                column=column,
+                parameters={"value": param if param else 0},
+                rationale=f"Impute nulls in {column}"
+            )
+        if action == "cap_outlier":
+            return TransformationStep(
+                type=PipelineTransformationType.CAP_OUTLIER,
+                column=column,
+                parameters={"method": param if param else "iqr"},
+                rationale=f"Cap outliers in {column}"
+            )
+        return None
+
+    def _build_silver_config(self, multi: MultiDatasetFindings, sources: Dict[str, ExplorationFindings]) -> SilverLayerConfig:
+        joins = []
+        for rel in multi.relationships:
+            joins.append({
+                "left_key": rel.left_column,
+                "right_key": rel.right_column,
+                "right_source": rel.right_dataset,
+                "how": "left"
+            })
+        return SilverLayerConfig(joins=joins, aggregations=[])
+
+    def _build_gold_config(self, sources: Dict[str, ExplorationFindings]) -> GoldLayerConfig:
+        encodings = []
+        scalings = []
+        for findings in sources.values():
+            for col_name, col_finding in findings.columns.items():
+                col_type = _resolve_col_type(col_finding)
+                if col_type == "categorical":
+                    encodings.append(TransformationStep(
+                        type=PipelineTransformationType.ENCODE,
+                        column=col_name,
+                        parameters={"method": "one_hot"},
+                        rationale=f"One-hot encode {col_name}"
+                    ))
+                elif col_type == "numeric":
+                    scalings.append(TransformationStep(
+                        type=PipelineTransformationType.SCALE,
+                        column=col_name,
+                        parameters={"method": "standard"},
+                        rationale=f"Standardize {col_name}"
+                    ))
+        return GoldLayerConfig(encodings=encodings, scalings=scalings)
+
+    def _find_target_column(self, sources: Dict[str, ExplorationFindings]) -> str:
+        for findings in sources.values():
+            if findings.target_column:
+                return findings.target_column
+        return "target"
+
+    def _apply_recommendations_to_config(self, config: PipelineConfig, registry: RecommendationRegistry, multi: MultiDatasetFindings) -> None:
+        self._apply_bronze_recommendations(config, registry)
+        self._apply_silver_recommendations(config, registry)
+        self._apply_gold_recommendations(config, registry)
+
+    def _apply_bronze_recommendations(self, config: PipelineConfig, registry: RecommendationRegistry) -> None:
+        sources_to_process = dict(registry.sources)
+        if not sources_to_process and hasattr(registry, 'bronze') and registry.bronze is not None:
+            sources_to_process = {"_default": registry.bronze}
+        for source_name, bronze_recs in sources_to_process.items():
+            target_bronze = self._find_bronze_config_for_source(config, source_name, bronze_recs.source_file)
+            if target_bronze is None:
+                continue
+            for rec in bronze_recs.null_handling:
+                step = self._map_bronze_null(rec)
+                if step:
+                    target_bronze.transformations.append(step)
+            for rec in bronze_recs.outlier_handling:
+                step = self._map_bronze_outlier(rec)
+                if step:
+                    target_bronze.transformations.append(step)
+            target_bronze.transformations = self._deduplicate_steps(target_bronze.transformations)
+
+    @staticmethod
+    def _deduplicate_steps(steps: List[TransformationStep]) -> List[TransformationStep]:
+        seen: Set[Tuple[PipelineTransformationType, str]] = set()
+        result: List[TransformationStep] = []
+        for step in steps:
+            key = (step.type, step.column)
+            if key not in seen:
+                seen.add(key)
+                result.append(step)
+        return result
+
+    def _find_bronze_config_for_source(self, config: PipelineConfig, source_name: str, source_file: str) -> Optional[BronzeLayerConfig]:
+        if source_name in config.bronze:
+            return config.bronze[source_name]
+        source_path = Path(source_file) if source_file else None
+        for name, bronze in config.bronze.items():
+            if source_path and Path(bronze.source.path).name == source_path.name:
+                return bronze
+            if source_path and Path(bronze.source.raw_source_path).name == source_path.name:
+                return bronze
+        if len(config.bronze) == 1:
+            return next(iter(config.bronze.values()))
+        return None
+
+    def _map_bronze_null(self, rec) -> Optional[TransformationStep]:
+        strategy = rec.parameters.get("strategy", "median")
+        if strategy == "drop":
+            return TransformationStep(
+                type=PipelineTransformationType.DROP_COLUMN,
+                column=rec.target_column,
+                parameters={"strategy": "drop"},
+                rationale=rec.rationale,
+                source_notebook=rec.source_notebook,
+            )
+        return TransformationStep(
+            type=PipelineTransformationType.IMPUTE_NULL,
+            column=rec.target_column,
+            parameters={"value": strategy},
+            rationale=rec.rationale,
+            source_notebook=rec.source_notebook,
+        )
+
+    def _map_bronze_outlier(self, rec) -> Optional[TransformationStep]:
+        if rec.action == "segment_aware_cap":
+            return TransformationStep(
+                type=PipelineTransformationType.SEGMENT_AWARE_CAP,
+                column=rec.target_column,
+                parameters={
+                    "method": rec.parameters.get("method", "segment_iqr"),
+                    "n_segments": rec.parameters.get("n_segments", 2),
+                },
+                rationale=rec.rationale,
+                source_notebook=rec.source_notebook,
+            )
+        if rec.action == "winsorize":
+            return TransformationStep(
+                type=PipelineTransformationType.WINSORIZE,
+                column=rec.target_column,
+                parameters={
+                    "lower_bound": rec.parameters.get("lower_bound", 0),
+                    "upper_bound": rec.parameters.get("upper_bound", 1000000),
+                },
+                rationale=rec.rationale,
+                source_notebook=rec.source_notebook,
+            )
+        return TransformationStep(
+            type=PipelineTransformationType.CAP_OUTLIER,
+            column=rec.target_column,
+            parameters={"method": rec.parameters.get("method", "iqr")},
+            rationale=rec.rationale,
+            source_notebook=rec.source_notebook,
+        )
+
+    def _apply_silver_recommendations(self, config: PipelineConfig, registry: RecommendationRegistry) -> None:
+        if not hasattr(registry, 'silver') or registry.silver is None:
+            return
+        for rec in getattr(registry.silver, 'derived_columns', []):
+            step = self._map_silver_derived(rec)
+            if step:
+                config.silver.derived_columns.append(step)
+
+    def _map_silver_derived(self, rec) -> Optional[TransformationStep]:
+        action = rec.action
+        params = dict(rec.parameters)
+        if action in ("ratio", "interaction", "composite"):
+            return TransformationStep(
+                type=PipelineTransformationType.DERIVED_COLUMN,
+                column=rec.target_column,
+                parameters={"action": action, **params},
+                rationale=rec.rationale,
+                source_notebook=rec.source_notebook,
+            )
+        return None
+
+    def _apply_gold_recommendations(self, config: PipelineConfig, registry: RecommendationRegistry) -> None:
+        if not hasattr(registry, 'gold') or registry.gold is None:
+            return
+        gold = registry.gold
+        seen_encoding_columns: Set[str] = {e.column for e in config.gold.encodings}
+        for rec in getattr(gold, 'encoding', []):
+            if rec.target_column in seen_encoding_columns:
+                continue
+            seen_encoding_columns.add(rec.target_column)
+            method = rec.parameters.get("method", rec.action)
+            if method in ("onehot", "one_hot"):
+                method = "one_hot"
+            config.gold.encodings.append(TransformationStep(
+                type=PipelineTransformationType.ENCODE,
+                column=rec.target_column,
+                parameters={"method": method},
+                rationale=rec.rationale,
+                source_notebook=rec.source_notebook,
+            ))
+        seen_scaling_columns: Set[str] = {s.column for s in config.gold.scalings}
+        for rec in getattr(gold, 'scaling', []):
+            if rec.target_column in seen_scaling_columns:
+                continue
+            seen_scaling_columns.add(rec.target_column)
+            config.gold.scalings.append(TransformationStep(
+                type=PipelineTransformationType.SCALE,
+                column=rec.target_column,
+                parameters={"method": rec.parameters.get("method", "standard")},
+                rationale=rec.rationale,
+                source_notebook=rec.source_notebook,
+            ))
+        for rec in getattr(gold, 'transformations', []):
+            step = self._map_gold_transformation(rec)
+            if step:
+                config.gold.transformations.append(step)
+        prioritized_columns = self._collect_prioritized_columns(gold)
+        drop_columns = self._collect_feature_selection_drops(gold, prioritized_columns)
+        config.gold.feature_selections = list(drop_columns)
+
+    def _map_gold_transformation(self, rec) -> Optional[TransformationStep]:
+        action = rec.action
+        type_map = {
+            "log": PipelineTransformationType.LOG_TRANSFORM,
+            "log_transform": PipelineTransformationType.LOG_TRANSFORM,
+            "sqrt": PipelineTransformationType.SQRT_TRANSFORM,
+            "sqrt_transform": PipelineTransformationType.SQRT_TRANSFORM,
+            "yeo_johnson": PipelineTransformationType.YEO_JOHNSON,
+            "zero_inflation_handling": PipelineTransformationType.ZERO_INFLATION_HANDLING,
+            "cap_then_log": PipelineTransformationType.CAP_THEN_LOG,
+        }
+        trans_type = type_map.get(action)
+        if trans_type is None:
+            return None
+        return TransformationStep(
+            type=trans_type,
+            column=rec.target_column,
+            parameters=dict(rec.parameters) if rec.parameters else {},
+            rationale=rec.rationale,
+            source_notebook=rec.source_notebook,
+        )
+
+    def _collect_prioritized_columns(self, gold) -> Set[str]:
+        prioritized = set()
+        for rec in getattr(gold, 'feature_selection', []):
+            if rec.action == "prioritize":
+                prioritized.add(rec.target_column)
+        return prioritized
+
+    def _collect_feature_selection_drops(self, gold, prioritized: Set[str]) -> Set[str]:
+        drops = set()
+        for rec in getattr(gold, 'feature_selection', []):
+            if rec.action in ("drop_multicollinear", "drop_weak"):
+                if rec.target_column not in prioritized:
+                    drops.add(rec.target_column)
+        return drops
+
+    @staticmethod
+    def _resolve_raw_time_column(findings: ExplorationFindings) -> Optional[str]:
+        """Get the raw data's time column, preferring datetime_columns over metadata.
+
+        time_series_metadata.time_column may be a post-processing name
+        (e.g. feature_timestamp) that doesn't exist in the raw data.
+        datetime_columns contains the original column names.
+        """
+        ts = findings.time_series_metadata
+        metadata_col = ts.time_column if ts else None
+        if metadata_col and metadata_col in findings.columns:
+            return metadata_col
+        if findings.datetime_columns:
+            return findings.datetime_columns[0]
+        return metadata_col
+
+    def _build_timestamp_coalesce_config(self, findings: ExplorationFindings) -> Optional[TimestampCoalesceConfig]:
+        if len(findings.datetime_ordering) <= 1:
+            return None
+        output_col = findings.time_series_metadata.time_column if findings.time_series_metadata else "feature_timestamp"
+        return TimestampCoalesceConfig(datetime_columns_ordered=findings.datetime_ordering, output_column=output_col)
+
+    def _build_label_timestamp_config(self, findings: ExplorationFindings) -> Optional[LabelTimestampConfig]:
+        if not findings.label_timestamp_column and findings.observation_window_days == 180:
+            return None
+        return LabelTimestampConfig(
+            label_column=findings.label_timestamp_column,
+            fallback_window_days=findings.observation_window_days,
+        )
+
+    def _build_landing_configs(self, config: PipelineConfig, multi: MultiDatasetFindings, sources: Dict[str, ExplorationFindings]) -> None:
+        for event_name in multi.event_datasets:
+            dataset_info = multi.datasets.get(event_name)
+            if not dataset_info:
+                continue
+            findings = sources.get(event_name)
+            if not findings:
+                continue
+            entity_col = (dataset_info.entity_column
+                          or (findings.time_series_metadata.entity_column if findings.time_series_metadata else None)
+                          or (findings.identifier_columns[0] if findings.identifier_columns else "id"))
+            time_col = (dataset_info.time_column
+                        or (findings.time_series_metadata.time_column if findings.time_series_metadata else None)
+                        or "timestamp")
+            raw_time_col = self._resolve_raw_time_column(findings)
+            raw_source = str(Path(dataset_info.source_path or findings.source_path).resolve())
+            source_cfg = next((s for s in config.sources if s.name == event_name), None)
+            if not source_cfg:
+                continue
+            original_target = self._resolve_original_target(findings, config.target_column)
+            config.landing[event_name] = LandingLayerConfig(
+                source=source_cfg,
+                raw_source_path=raw_source,
+                raw_source_format=self._infer_format(raw_source),
+                entity_column=entity_col,
+                time_column=time_col,
+                target_column=config.target_column,
+                original_target_column=original_target,
+                raw_time_column=raw_time_col if raw_time_col and raw_time_col != time_col else None,
+                timestamp_coalesce=self._build_timestamp_coalesce_config(findings),
+                label_timestamp=self._build_label_timestamp_config(findings),
+            )
+
+    @staticmethod
+    def _resolve_original_target(findings: ExplorationFindings, target_column: str) -> Optional[str]:
+        original = findings.metadata.get("original_target_column") if findings.metadata else None
+        if original and original != target_column:
+            return original
+        return None
+
+    def _build_aggregation_config(self, multi: MultiDatasetFindings, findings: ExplorationFindings) -> Optional[AggregationWindowConfig]:
+        windows = getattr(multi, 'aggregation_windows', None) or []
+        if not windows and findings.time_series_metadata:
+            windows = getattr(findings.time_series_metadata, 'suggested_aggregations', []) or []
+        if not windows:
+            return None
+        value_columns = []
+        for col_name, col_finding in findings.columns.items():
+            col_type = _resolve_col_type(col_finding)
+            if col_type in ("numeric_continuous", "numeric_discrete", "numeric", "binary"):
+                if col_name not in (findings.target_column or ""):
+                    value_columns.append(col_name)
+        return AggregationWindowConfig(
+            windows=windows,
+            value_columns=value_columns,
+            agg_funcs=["sum", "mean", "max", "count"],
+        )
+
+    def _build_lifecycle_config(self, multi: MultiDatasetFindings) -> Optional[LifecycleConfig]:
+        notes = getattr(multi, 'notes', None)
+        if not notes:
+            return None
+        temporal_config = notes.get("temporal_config", {}) if isinstance(notes, dict) else {}
+        feature_groups = temporal_config.get("feature_groups", [])
+        return LifecycleConfig(
+            include_lifecycle_quadrant="lifecycle" in feature_groups,
+            include_cyclical_features="regularity" in feature_groups,
+            include_recency_bucket="recency" in feature_groups,
+            momentum_pairs=[],
+        )
+
+    def _build_bronze_event_configs(
+        self,
+        config: PipelineConfig,
+        multi: MultiDatasetFindings,
+        source_findings: Dict[str, ExplorationFindings],
+        discovered_events: Optional[Dict[str, ExplorationFindings]] = None,
+    ) -> None:
+        lifecycle_config = self._build_lifecycle_config(multi)
+        for event_name in multi.event_datasets:
+            findings = source_findings.get(event_name)
+            if not findings:
+                continue
+            source_cfg = next((s for s in config.sources if s.name == event_name), None)
+            if not source_cfg:
+                continue
+            dataset_info = multi.datasets.get(event_name)
+            entity_col = (dataset_info.entity_column if dataset_info else None) or source_cfg.entity_key
+            time_col = (dataset_info.time_column if dataset_info else None) or source_cfg.time_column or "timestamp"
+            raw_time_col = self._resolve_raw_time_column(findings)
+            config.bronze_event[event_name] = BronzeEventConfig(
+                source=source_cfg, entity_column=entity_col, time_column=time_col,
+                deduplicate=True,
+                pre_shaping=self._extract_transformations(findings),
+                aggregation=self._build_aggregation_config(multi, findings),
+                lifecycle=lifecycle_config,
+                raw_time_column=raw_time_col if raw_time_col and raw_time_col != time_col else None,
+            )
+        for agg_name, preagg in (discovered_events or {}).items():
+            if agg_name in config.bronze_event:
+                continue
+            source_cfg = next((s for s in config.sources if s.name == agg_name), None)
+            if not source_cfg:
+                continue
+            ts = preagg.time_series_metadata
+            entity_col = (ts.entity_column if ts else None) or source_cfg.entity_key
+            time_col = (ts.time_column if ts else None) or source_cfg.time_column or "timestamp"
+            raw_time_col = self._resolve_raw_time_column(preagg)
+            config.bronze_event[agg_name] = BronzeEventConfig(
+                source=source_cfg, entity_column=entity_col, time_column=time_col,
+                deduplicate=True,
+                pre_shaping=self._extract_transformations(preagg),
+                aggregation=self._build_aggregation_config(multi, preagg),
+                lifecycle=lifecycle_config,
+                raw_time_column=raw_time_col if raw_time_col and raw_time_col != time_col else None,
+            )
+
+    def _discover_event_sources(self, source_findings: Dict[str, ExplorationFindings]) -> Dict[str, ExplorationFindings]:
+        index = self._build_aggregated_path_index()
+        if not index:
+            return {}
+        return self._scan_for_preagg_findings(index)
+
+    def _build_aggregated_path_index(self) -> Dict[Path, str]:
+        return {path: name for name, path in self._source_findings_paths.items()}
+
+    def _scan_for_preagg_findings(self, index: Dict[Path, str]) -> Dict[str, ExplorationFindings]:
+        loaded_paths = set(self._source_findings_paths.values())
+        result: Dict[str, ExplorationFindings] = {}
+        for candidate in self._findings_dir.glob("*_findings.yaml"):
+            resolved = candidate.resolve()
+            if resolved in loaded_paths:
+                continue
+            if candidate.name == "multi_dataset_findings.yaml":
+                continue
+            try:
+                preagg = ExplorationFindings.load(str(candidate))
+            except Exception:
+                continue
+            source_name = self._match_preagg_to_source(preagg, index)
+            if source_name is not None:
+                result[source_name] = preagg
+        return result
+
+    def _match_preagg_to_source(self, preagg: ExplorationFindings, index: Dict[Path, str]) -> Optional[str]:
+        if not preagg.has_aggregated_output:
+            return None
+        agg_path_str = preagg.time_series_metadata.aggregated_findings_path
+        if not agg_path_str:
+            return None
+        agg_path = Path(agg_path_str).resolve()
+        return index.get(agg_path)
+
+    def _build_discovered_landing_configs(
+        self,
+        config: PipelineConfig,
+        discovered: Dict[str, ExplorationFindings],
+        multi: MultiDatasetFindings,
+    ) -> None:
+        for agg_name, preagg in discovered.items():
+            if agg_name in config.landing:
+                continue
+            source_cfg = next((s for s in config.sources if s.name == agg_name), None)
+            if not source_cfg:
+                continue
+            ts = preagg.time_series_metadata
+            entity_col = (ts.entity_column if ts else None) or source_cfg.entity_key
+            time_col = (ts.time_column if ts else None) or "timestamp"
+            raw_time_col = self._resolve_raw_time_column(preagg)
+            source_cfg.is_event_level = True
+            source_cfg.time_column = time_col
+            source_cfg.entity_key = entity_col
+            raw_source = str(Path(preagg.source_path).resolve())
+            original_target = self._resolve_original_target(preagg, config.target_column)
+            config.landing[agg_name] = LandingLayerConfig(
+                source=source_cfg,
+                raw_source_path=raw_source,
+                raw_source_format=self._infer_format(raw_source),
+                entity_column=entity_col,
+                time_column=time_col,
+                target_column=config.target_column,
+                original_target_column=original_target,
+                raw_time_column=raw_time_col if raw_time_col and raw_time_col != time_col else None,
+                timestamp_coalesce=self._build_timestamp_coalesce_config(preagg),
+                label_timestamp=self._build_label_timestamp_config(preagg),
+            )
+
+    @staticmethod
+    def _reconcile_discovered_event_transforms(config: "PipelineConfig", discovered_events: Dict[str, ExplorationFindings]) -> None:
+        if not discovered_events:
+            return
+        for name in list(discovered_events.keys()):
+            if name in config.bronze and name in config.bronze_event:
+                config.bronze_event[name].post_shaping.extend(config.bronze[name].transformations)
+                del config.bronze[name]
+
+    @staticmethod
+    def _infer_format(path: str) -> str:
+        ext = Path(path).suffix.lower()
+        if ext == ".csv":
+            return "csv"
+        return "parquet"