churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
customer_retention/generators/notebook_generator/stages/base_stage.py
@@ -0,0 +1,86 @@
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, List, Optional
+
+import nbformat
+
+from ..base import NotebookStage
+from ..cell_builder import CellBuilder
+from ..config import NotebookConfig, Platform
+
+if TYPE_CHECKING:
+    from customer_retention.analysis.auto_explorer import ExplorationFindings
+
+
+class StageGenerator(ABC):
+    def __init__(self, config: NotebookConfig, findings: Optional["ExplorationFindings"]):
+        self.config = config
+        self.findings = findings
+        self.cb = CellBuilder
+
+    @property
+    @abstractmethod
+    def stage(self) -> NotebookStage:
+        pass
+
+    @property
+    @abstractmethod
+    def title(self) -> str:
+        pass
+
+    @property
+    def description(self) -> str:
+        return ""
+
+    def generate(self, platform: Platform) -> List[nbformat.NotebookNode]:
+        if platform == Platform.LOCAL:
+            return self.generate_local_cells()
+        return self.generate_databricks_cells()
+
+    @abstractmethod
+    def generate_local_cells(self) -> List[nbformat.NotebookNode]:
+        pass
+
+    @abstractmethod
+    def generate_databricks_cells(self) -> List[nbformat.NotebookNode]:
+        pass
+
+    def header_cells(self) -> List[nbformat.NotebookNode]:
+        cells = [self.cb.header(self.title)]
+        if self.description:
+            cells.append(self.cb.markdown(self.description))
+        return cells
+
+    def get_target_column(self) -> str:
+        if self.findings and hasattr(self.findings, "target_column") and self.findings.target_column:
+            return self.findings.target_column
+        return "target"
+
+    def get_identifier_columns(self) -> List[str]:
+        if self.findings and hasattr(self.findings, "identifier_columns") and self.findings.identifier_columns:
+            return self.findings.identifier_columns
+        return ["customer_id"]
+
+    def get_feature_columns(self) -> List[str]:
+        if not self.findings or not hasattr(self.findings, "columns"):
+            return []
+        from customer_retention.core.config import ColumnType
+        feature_types = {ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE,
+                         ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL, ColumnType.BINARY}
+        return [name for name, col in self.findings.columns.items()
+                if hasattr(col, "inferred_type") and col.inferred_type in feature_types]
+
+    def get_numeric_columns(self) -> List[str]:
+        if not self.findings or not hasattr(self.findings, "columns"):
+            return []
+        from customer_retention.core.config import ColumnType
+        return [name for name, col in self.findings.columns.items()
+                if hasattr(col, "inferred_type") and col.inferred_type in
+                {ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE}]
+
+    def get_categorical_columns(self) -> List[str]:
+        if not self.findings or not hasattr(self.findings, "columns"):
+            return []
+        from customer_retention.core.config import ColumnType
+        return [name for name, col in self.findings.columns.items()
+                if hasattr(col, "inferred_type") and col.inferred_type in
+                {ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL}]
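base_stage.py is the template the stage modules below follow: a subclass supplies stage, title, and the two cell-generation methods, and generate dispatches on the target platform (Platform.LOCAL gets the local cells; anything else falls through to the Databricks cells). Note that CellBuilder and Platform are used at runtime (self.cb = CellBuilder, platform == Platform.LOCAL), so they are imported eagerly above rather than under TYPE_CHECKING. A minimal driver for these generators, assuming a valid NotebookConfig and using only the public nbformat API, might look like:

import nbformat

from customer_retention.generators.notebook_generator.config import Platform
from customer_retention.generators.notebook_generator.stages.s01_ingestion import IngestionStage

def write_stage_notebook(config, findings, path):
    # Each stage returns a list of nbformat cells; assemble them into a notebook.
    stage = IngestionStage(config, findings)
    nb = nbformat.v4.new_notebook()
    nb.cells = stage.generate(Platform.LOCAL)  # any other value yields the Databricks cells
    nbformat.write(nb, path)

This is only a sketch; the package's own runner.py and the local/databricks generator modules presumably handle this assembly.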
customer_retention/generators/notebook_generator/stages/s01_ingestion.py
@@ -0,0 +1,100 @@
+from typing import List
+
+import nbformat
+
+from ..base import NotebookStage
+from .base_stage import StageGenerator
+
+
+class IngestionStage(StageGenerator):
+    @property
+    def stage(self) -> NotebookStage:
+        return NotebookStage.INGESTION
+
+    @property
+    def title(self) -> str:
+        return "01 - Configuration & Data Ingestion"
+
+    @property
+    def description(self) -> str:
+        return "Load raw data, configure pipeline context, and save to bronze layer."
+
+    def generate_local_cells(self) -> List[nbformat.NotebookNode]:
+        findings_path = self.findings.source_path if self.findings else "./data/customers.csv"
+        source_format = getattr(self.findings, "source_format", "csv") if self.findings else "csv"
+        return self.header_cells() + [
+            self.cb.section("Imports"),
+            self.cb.from_imports_cell({
+                "customer_retention.generators.orchestration": ["setup_notebook_context", "PipelineContext"],
+                "customer_retention.stages.ingestion": ["DataSourceRegistry"],
+                "customer_retention.analysis.auto_explorer": ["ExplorationFindings"],
+                "customer_retention.stages.temporal": ["ScenarioDetector", "UnifiedDataPreparer"],
+                "datetime": ["datetime"],
+                "pathlib": ["Path"],
+            }),
+            self.cb.section("Configuration"),
+            self.cb.code(f'''FINDINGS_PATH = "{findings_path}"
+DATA_FORMAT = "{source_format}"
+OUTPUT_DIR = Path("./experiments/data")'''),
+            self.cb.section("Load Exploration Findings"),
+            self.cb.code('''findings = ExplorationFindings.load(FINDINGS_PATH)
+print(f"Loaded findings: {findings.row_count} rows, {findings.column_count} columns")
+print(f"Target column: {findings.target_column}")'''),
+            self.cb.section("Setup Pipeline Context"),
+            self.cb.code('''ctx, manager = setup_notebook_context(exploration_findings=findings)
+print(f"Pipeline context initialized for: {ctx.config.project_name}")'''),
+            self.cb.section("Load Raw Data"),
+            self.cb.code('''registry = DataSourceRegistry()
+df = registry.load(findings.source_path, format=DATA_FORMAT)
+print(f"Loaded {len(df)} rows")
+df.head()'''),
+            self.cb.section("Detect Timestamp Scenario"),
+            self.cb.code('''detector = ScenarioDetector()
+scenario, ts_config, discovery_result = detector.detect(df, findings.target_column)
+print(f"Detected scenario: {scenario}")
+print(f"Strategy: {ts_config.strategy.value}")
+print(f"Recommendation: {discovery_result.recommendation}")'''),
+            self.cb.section("Prepare Data with Timestamps"),
+            self.cb.code('''preparer = UnifiedDataPreparer(OUTPUT_DIR, ts_config)
+unified_df = preparer.prepare_from_raw(
+    df,
+    target_column=findings.target_column,
+    entity_column=findings.entity_id_column or "custid"
+)
+print(f"Prepared {len(unified_df)} rows with timestamps")
+print("Timestamp columns: feature_timestamp, label_timestamp, label_available_flag")'''),
+            self.cb.section("Create Training Snapshot"),
+            self.cb.code('''cutoff_date = datetime.now()
+snapshot_df, metadata = preparer.create_training_snapshot(unified_df, cutoff_date)
+print(f"Created snapshot: {metadata['snapshot_id']}")
+print(f"Rows: {metadata['row_count']}")
+print(f"Features: {len(metadata['feature_columns'])}")'''),
+            self.cb.section("Save Processed Data"),
+            self.cb.code('''manager.update(current_df=snapshot_df, current_stage="bronze")
+print(f"Pipeline context updated. Use snapshot '{metadata['snapshot_id']}' for training.")'''),
+        ]
+
+    def generate_databricks_cells(self) -> List[nbformat.NotebookNode]:
+        catalog = self.config.feature_store.catalog
+        schema = self.config.feature_store.schema
+        data_path = self.findings.source_path if self.findings else "/mnt/landing/customers"
+        source_format = getattr(self.findings, "source_format", "csv") if self.findings else "csv"
+        return self.header_cells() + [
+            self.cb.section("Configuration"),
+            self.cb.code(f'''CATALOG = "{catalog}"
+SCHEMA = "{schema}"
+DATA_PATH = "{data_path}"
+spark.sql(f"USE CATALOG {{CATALOG}}")
+spark.sql(f"USE SCHEMA {{SCHEMA}}")'''),
+            self.cb.section("Load Raw Data"),
+            self.cb.code(f'''df_raw = (spark.read
+    .format("{source_format}")
+    .option("header", "true")
+    .option("inferSchema", "true")
+    .load(DATA_PATH))
+print(f"Loaded {{df_raw.count()}} rows")
+display(df_raw.limit(10))'''),
+            self.cb.section("Save to Bronze Table"),
+            self.cb.code('''df_raw.write.format("delta").mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.bronze_customers")
+print("Bronze table created")'''),
+        ]
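from_imports_cell is defined in cell_builder.py, which this excerpt does not show; from the call sites it evidently maps module paths to lists of names. A naive rendering of that contract, sketched here purely as an illustration, would be:

import nbformat

def naive_from_imports_cell(imports):
    # One "from module import a, b" line per dict entry (illustrative only;
    # the real CellBuilder may differ, e.g. in alias handling).
    lines = [f"from {module} import {', '.join(names)}"
             for module, names in imports.items()]
    return nbformat.v4.new_code_cell("\n".join(lines))

Under this naive reading, entries such as "pandas": ["pd"] in the later stages would render as `from pandas import pd`, which is not importable, so the real builder presumably special-cases aliases (e.g. emitting `import pandas as pd`).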
customer_retention/generators/notebook_generator/stages/s02_profiling.py
@@ -0,0 +1,95 @@
+from typing import List
+
+import nbformat
+
+from ..base import NotebookStage
+from .base_stage import StageGenerator
+
+
+class ProfilingStage(StageGenerator):
+    @property
+    def stage(self) -> NotebookStage:
+        return NotebookStage.PROFILING
+
+    @property
+    def title(self) -> str:
+        return "02 - Data Profiling"
+
+    @property
+    def description(self) -> str:
+        return "Generate column statistics, type detection, and quality metrics."
+
+    def generate_local_cells(self) -> List[nbformat.NotebookNode]:
+        return self.header_cells() + [
+            self.cb.section("Imports"),
+            self.cb.from_imports_cell({
+                "customer_retention.stages.profiling": ["TypeDetector", "ProfilerFactory", "QualityCheckRegistry"],
+                "customer_retention.analysis.visualization": ["ChartBuilder"],
+                "pandas": ["pd"],
+            }),
+            self.cb.section("Load Bronze Data"),
+            self.cb.code('''from customer_retention.integrations.adapters.factory import get_delta
+storage = get_delta(force_local=True)
+df = storage.read("./experiments/data/bronze/customers")
+print(f"Loaded {len(df)} rows, {len(df.columns)} columns")'''),
+            self.cb.section("Type Detection"),
+            self.cb.code('''detector = TypeDetector()
+type_results = {col: detector.detect(df[col]) for col in df.columns}
+for col, result in type_results.items():
+    print(f"{col}: {result.column_type.value} (confidence: {result.confidence:.2f})")'''),
+            self.cb.section("Column Profiling"),
+            self.cb.code('''factory = ProfilerFactory()
+profiles = {}
+for col in df.columns:
+    profiler = factory.get_profiler(type_results[col].column_type)
+    profiles[col] = profiler.profile(df[col])'''),
+            self.cb.section("Quality Checks"),
+            self.cb.code('''registry = QualityCheckRegistry()
+checks = registry.get_all_checks()
+results = []
+for check in checks:
+    for col in df.columns:
+        result = check.check(df[col], profiles.get(col))
+        if result.passed is False:
+            results.append({"column": col, "check": check.name, "severity": result.severity.value, "message": result.message})
+quality_df = pd.DataFrame(results)
+quality_df'''),
+            self.cb.section("Visualize Quality"),
+            self.cb.code('''charts = ChartBuilder()
+if len(quality_df) > 0:
+    fig = charts.quality_heatmap(quality_df)
+    fig.show()'''),
+        ]
+
+    def generate_databricks_cells(self) -> List[nbformat.NotebookNode]:
+        catalog = self.config.feature_store.catalog
+        schema = self.config.feature_store.schema
+        return self.header_cells() + [
+            self.cb.section("Load Bronze Data"),
+            self.cb.code(f'''df = spark.table("{catalog}.{schema}.bronze_customers")
+print(f"Loaded {{df.count()}} rows")'''),
+            self.cb.section("Basic Statistics"),
+            self.cb.code('''summary = df.describe()
+display(summary)'''),
+            self.cb.section("Column Types and Nulls"),
+            self.cb.code('''from pyspark.sql.functions import col, count, when, isnan
+
+null_counts = df.select([
+    count(when(col(c).isNull() | isnan(col(c)), c)).alias(c)
+    for c in df.columns
+])
+display(null_counts)'''),
+            self.cb.section("Distinct Values"),
+            self.cb.code('''from pyspark.sql.functions import countDistinct
+
+distinct_counts = df.select([countDistinct(col(c)).alias(c) for c in df.columns])
+display(distinct_counts)'''),
+            self.cb.section("Save Profiling Results"),
+            self.cb.code('''profile_data = {
+    "columns": df.columns,
+    "dtypes": [str(f.dataType) for f in df.schema.fields],
+    "row_count": df.count()
+}
+import json
+dbutils.fs.put("/tmp/profile_results.json", json.dumps(profile_data), overwrite=True)'''),
+        ]
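One caveat on the null-count cell in the Databricks path above: Spark's isnan is only meaningful for floating-point data, and on columns that cannot be cast to double (timestamps, for instance) the expression can fail to analyze. A defensive variant of that generated cell, sketched under this assumption, applies isnan only to numeric fields:

from pyspark.sql.functions import col, count, isnan, when
from pyspark.sql.types import NumericType

numeric = {f.name for f in df.schema.fields if isinstance(f.dataType, NumericType)}

def null_test(c):
    # isnan is valid only on numeric columns; elsewhere use a plain null check
    test = col(c).isNull()
    if c in numeric:
        test = test | isnan(col(c))
    return test

null_counts = df.select([count(when(null_test(c), c)).alias(c) for c in df.columns])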
customer_retention/generators/notebook_generator/stages/s03_cleaning.py
@@ -0,0 +1,180 @@
+from typing import List
+
+import nbformat
+
+from ..base import NotebookStage
+from .base_stage import StageGenerator
+
+
+class CleaningStage(StageGenerator):
+    @property
+    def stage(self) -> NotebookStage:
+        return NotebookStage.CLEANING
+
+    @property
+    def title(self) -> str:
+        return "03 - Data Cleaning"
+
+    @property
+    def description(self) -> str:
+        return "Handle missing values and outliers based on column types with MLflow tracking."
+
+    def _get_cleaning_recommendations(self) -> dict:
+        recommendations = {}
+        if not self.findings or not hasattr(self.findings, "columns"):
+            return recommendations
+        for col_name, col_finding in self.findings.columns.items():
+            if hasattr(col_finding, "cleaning_recommendations") and col_finding.cleaning_recommendations:
+                recommendations[col_name] = col_finding.cleaning_recommendations
+        return recommendations
+
+    def generate_local_cells(self) -> List[nbformat.NotebookNode]:
+        numeric_cols = self.get_numeric_columns()
+        categorical_cols = self.get_categorical_columns()
+        tracking_uri = self.config.mlflow.tracking_uri
+        exp_name = self.config.mlflow.experiment_name
+        cleaning_recs = self._get_cleaning_recommendations()
+
+        cells = self.header_cells() + [
+            self.cb.section("Imports"),
+            self.cb.from_imports_cell({
+                "customer_retention.stages.cleaning": ["MissingValueHandler", "OutlierHandler"],
+                "customer_retention.integrations.adapters": ["get_mlflow"],
+                "pandas": ["pd"],
+            }),
+            self.cb.section("Setup MLflow Tracking"),
+            self.cb.code(f'''mlflow_adapter = get_mlflow(tracking_uri="{tracking_uri}", force_local=True)
+mlflow_adapter.start_run("{exp_name}", run_name="03_data_cleaning")
+cleaning_stats = {{}}'''),
+            self.cb.section("Load Bronze Data"),
+            self.cb.code('''from customer_retention.integrations.adapters.factory import get_delta
+storage = get_delta(force_local=True)
+df = storage.read("./experiments/data/bronze/customers")
+initial_shape = df.shape
+initial_nulls = df.isnull().sum().sum()
+print(f"Initial shape: {df.shape}")
+print(f"Total missing values: {initial_nulls}")
+
+mlflow_adapter.log_metrics({
+    "bronze_rows": initial_shape[0],
+    "bronze_columns": initial_shape[1],
+    "bronze_total_nulls": initial_nulls,
+})'''),
+        ]
+
+        if cleaning_recs:
+            cells.append(self.cb.section("Apply Cleaning from Exploration Findings"))
+            cells.append(self.cb.code(f'''cleaning_recommendations = {cleaning_recs}
+print(f"Found cleaning recommendations for {{len(cleaning_recommendations)}} columns")'''))
+
+        cells.extend([
+            self.cb.section("Handle Missing Values - Numeric Columns"),
+            self.cb.code(f'''numeric_cols = {numeric_cols}
+missing_handler = MissingValueHandler(strategy="median")
+for col in numeric_cols:
+    if col in df.columns and df[col].isnull().any():
+        nulls_before = df[col].isnull().sum()
+        df[col] = missing_handler.fit_transform(df[col])
+        cleaning_stats[f"{{col}}_nulls_imputed"] = nulls_before
+        print(f"Imputed {{col}}: {{nulls_before}} missing values")'''),
+            self.cb.section("Handle Missing Values - Categorical Columns"),
+            self.cb.code(f'''categorical_cols = {categorical_cols}
+missing_handler_cat = MissingValueHandler(strategy="mode")
+for col in categorical_cols:
+    if col in df.columns and df[col].isnull().any():
+        nulls_before = df[col].isnull().sum()
+        df[col] = missing_handler_cat.fit_transform(df[col])
+        cleaning_stats[f"{{col}}_nulls_imputed"] = nulls_before
+        print(f"Imputed {{col}}: {{nulls_before}} missing values")'''),
+            self.cb.section("Handle Outliers"),
+            self.cb.code('''outlier_handler = OutlierHandler(method="iqr", treatment="cap")
+for col in numeric_cols:
+    if col in df.columns:
+        q1, q3 = df[col].quantile([0.25, 0.75])
+        iqr = q3 - q1
+        outliers = ((df[col] < q1 - 1.5*iqr) | (df[col] > q3 + 1.5*iqr)).sum()
+        cleaning_stats[f"{col}_outliers_capped"] = outliers
+        df[col] = outlier_handler.fit_transform(df[col])
+print("Outliers capped using IQR method")'''),
+            self.cb.section("Log Cleaning Statistics to MLflow"),
+            self.cb.code('''final_nulls = df.isnull().sum().sum()
+mlflow_adapter.log_params({
+    "numeric_strategy": "median",
+    "categorical_strategy": "mode",
+    "outlier_method": "iqr",
+    "outlier_treatment": "cap",
+})
+mlflow_adapter.log_metrics({
+    "silver_rows": df.shape[0],
+    "silver_columns": df.shape[1],
+    "silver_total_nulls": final_nulls,
+    "nulls_removed": initial_nulls - final_nulls,
+    **{k: v for k, v in cleaning_stats.items() if isinstance(v, (int, float))}
+})
+print(f"Logged {len(cleaning_stats)} cleaning statistics to MLflow")'''),
+            self.cb.section("Save to Silver Layer"),
+            self.cb.code('''storage.write(df, "./experiments/data/silver/customers_cleaned")
+mlflow_adapter.end_run()
+print(f"Silver layer saved: {df.shape}")'''),
+        ])
+        return cells
+
+    def generate_databricks_cells(self) -> List[nbformat.NotebookNode]:
+        catalog = self.config.feature_store.catalog
+        schema = self.config.feature_store.schema
+        exp_name = self.config.mlflow.experiment_name
+        numeric_cols = self.get_numeric_columns()
+        categorical_cols = self.get_categorical_columns()
+        return self.header_cells() + [
+            self.cb.section("Setup MLflow Tracking"),
+            self.cb.code(f'''import mlflow
+
+mlflow.set_experiment("/Users/{{spark.conf.get('spark.databricks.notebook.username', 'default')}}/{exp_name}")
+mlflow.start_run(run_name="03_data_cleaning")
+cleaning_stats = {{}}'''),
+            self.cb.section("Load Bronze Data"),
+            self.cb.code(f'''df = spark.table("{catalog}.{schema}.bronze_customers")
+initial_count = df.count()
+print(f"Initial count: {{initial_count}}")
+mlflow.log_metric("bronze_rows", initial_count)'''),
+            self.cb.section("Handle Missing Values - Numeric Columns"),
+            self.cb.code(f'''from pyspark.sql.functions import col, when, lit, sum as spark_sum
+from pyspark.ml.feature import Imputer
+
+numeric_cols = {numeric_cols}
+imputer = Imputer(inputCols=numeric_cols, outputCols=numeric_cols, strategy="median")
+df = imputer.fit(df).transform(df)
+mlflow.log_param("numeric_strategy", "median")
+print("Numeric columns imputed with median")'''),
+            self.cb.section("Handle Missing Values - Categorical Columns"),
+            self.cb.code(f'''categorical_cols = {categorical_cols}
+for col_name in categorical_cols:
+    mode_val = df.groupBy(col_name).count().orderBy("count", ascending=False).first()[0]
+    df = df.fillna({{col_name: mode_val}})
+mlflow.log_param("categorical_strategy", "mode")
+print("Categorical columns imputed with mode")'''),
+            self.cb.section("Handle Outliers with IQR"),
+            self.cb.code('''for col_name in numeric_cols:
+    quantiles = df.approxQuantile(col_name, [0.25, 0.75], 0.05)
+    if len(quantiles) == 2:
+        q1, q3 = quantiles
+        iqr = q3 - q1
+        lower = q1 - 1.5 * iqr
+        upper = q3 + 1.5 * iqr
+        df = df.withColumn(col_name, when(col(col_name) < lower, lower)
+                           .when(col(col_name) > upper, upper)
+                           .otherwise(col(col_name)))
+mlflow.log_params({"outlier_method": "iqr", "outlier_treatment": "cap"})
+print("Outliers capped using IQR")'''),
+            self.cb.section("Log Cleaning Statistics"),
+            self.cb.code('''final_count = df.count()
+mlflow.log_metrics({
+    "silver_rows": final_count,
+    "rows_preserved_pct": final_count / initial_count * 100,
+})
+print(f"Final count: {final_count}")'''),
+            self.cb.section("Save to Silver Table"),
+            self.cb.code(f'''df.write.format("delta").mode("overwrite").saveAsTable("{catalog}.{schema}.silver_customers")
+mlflow.end_run()
+print("Silver table created")'''),
+        ]
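Both cleaning paths apply the same Tukey fence arithmetic: with q1 = 10 and q3 = 30, iqr = 20, so the fences are [10 - 1.5*20, 30 + 1.5*20] = [-20, 60] and anything outside is capped to the nearest fence. In pandas the per-column capping loop above is equivalent to a single clip, as in this small sketch (the column name is hypothetical):

import pandas as pd

df = pd.DataFrame({"monthly_spend": [5, 12, 18, 25, 31, 500]})
q1, q3 = df["monthly_spend"].quantile([0.25, 0.75])
iqr = q3 - q1
# 500 falls above the upper fence and is capped to it
df["monthly_spend"] = df["monthly_spend"].clip(q1 - 1.5 * iqr, q3 + 1.5 * iqr)

One edge case in the categorical cell of the Databricks path: if null is the most frequent value in a column, first()[0] returns None, which fillna cannot use as a fill value; filtering nulls out before the groupBy avoids this.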
customer_retention/generators/notebook_generator/stages/s04_transformation.py
@@ -0,0 +1,165 @@
+from typing import List
+
+import nbformat
+
+from ..base import NotebookStage
+from .base_stage import StageGenerator
+
+
+class TransformationStage(StageGenerator):
+    @property
+    def stage(self) -> NotebookStage:
+        return NotebookStage.TRANSFORMATION
+
+    @property
+    def title(self) -> str:
+        return "04 - Data Transformation"
+
+    @property
+    def description(self) -> str:
+        return "Apply scaling, encoding, and type transformations with MLflow tracking."
+
+    def _get_transform_recommendations(self) -> dict:
+        recommendations = {}
+        if not self.findings or not hasattr(self.findings, "columns"):
+            return recommendations
+        for col_name, col_finding in self.findings.columns.items():
+            if hasattr(col_finding, "transformation_recommendations") and col_finding.transformation_recommendations:
+                recommendations[col_name] = col_finding.transformation_recommendations
+        return recommendations
+
+    def generate_local_cells(self) -> List[nbformat.NotebookNode]:
+        numeric_cols = self.get_numeric_columns()
+        categorical_cols = self.get_categorical_columns()
+        tracking_uri = self.config.mlflow.tracking_uri
+        exp_name = self.config.mlflow.experiment_name
+        transform_recs = self._get_transform_recommendations()
+
+        cells = self.header_cells() + [
+            self.cb.section("Imports"),
+            self.cb.from_imports_cell({
+                "customer_retention.stages.transformation": ["NumericTransformer", "CategoricalEncoder"],
+                "customer_retention.stages.preprocessing": ["TransformerManager"],
+                "customer_retention.integrations.adapters": ["get_mlflow"],
+                "pandas": ["pd"],
+            }),
+            self.cb.section("Setup MLflow Tracking"),
+            self.cb.code(f'''mlflow_adapter = get_mlflow(tracking_uri="{tracking_uri}", force_local=True)
+mlflow_adapter.start_run("{exp_name}", run_name="04_transformation")
+transform_stats = {{}}'''),
+            self.cb.section("Load Silver Data"),
+            self.cb.code('''from customer_retention.integrations.adapters.factory import get_delta
+storage = get_delta(force_local=True)
+df = storage.read("./experiments/data/silver/customers_cleaned")
+print(f"Loaded shape: {df.shape}")
+mlflow_adapter.log_metric("input_rows", df.shape[0])
+mlflow_adapter.log_metric("input_columns", df.shape[1])'''),
+        ]
+
+        if transform_recs:
+            cells.append(self.cb.section("Transformation Recommendations from Exploration"))
+            cells.append(self.cb.code(f'''transform_recommendations = {transform_recs}
+print(f"Found transformation recommendations for {{len(transform_recommendations)}} columns")'''))
+
+        cells.extend([
+            self.cb.section("Initialize Transformer Manager"),
+            self.cb.code(f'''numeric_cols = {numeric_cols}
+categorical_cols = {categorical_cols}
+
+# TransformerManager ensures consistent transformations between training and scoring
+transformer_manager = TransformerManager(scaler_type="standard")'''),
+            self.cb.section("Fit and Transform Features"),
+            self.cb.code('''# Fit transformers and transform data in one step
+# Exclude identifier and target columns from transformation
+exclude_cols = ["customer_id", "target"]  # Adjust based on your data
+df = transformer_manager.fit_transform(
+    df,
+    numeric_columns=numeric_cols,
+    categorical_columns=categorical_cols,
+    exclude_columns=exclude_cols
+)
+
+# Log transformation statistics
+manifest = transformer_manager.manifest
+transform_stats["numeric_cols_scaled"] = len(manifest.numeric_columns)
+transform_stats["categorical_cols_encoded"] = len(manifest.categorical_columns)
+
+mlflow_adapter.log_params({
+    "scaler_type": manifest.scaler_type,
+    "encoder_type": manifest.encoder_type,
+    "scaled_columns": str(manifest.numeric_columns)[:250],
+    "encoded_columns": str(manifest.categorical_columns)[:250],
+})
+print(f"Scaled {len(manifest.numeric_columns)} numeric columns")
+print(f"Encoded {len(manifest.categorical_columns)} categorical columns")'''),
+            self.cb.section("Save Transformers as Artifacts"),
+            self.cb.code('''# Save transformers locally and to MLflow
+transformer_manager.save("./experiments/data/transformers/transformers.joblib")
+
+# Log to MLflow for scoring pipeline to retrieve
+import mlflow
+transformer_manager.log_to_mlflow(run_id=mlflow.active_run().info.run_id)
+print("Transformers saved locally and logged to MLflow")
+print("Scoring pipeline will use these same transformers for consistency")'''),
+            self.cb.section("Log Transformation Statistics"),
+            self.cb.code('''mlflow_adapter.log_metrics({
+    "output_rows": df.shape[0],
+    "output_columns": df.shape[1],
+    **{k: v for k, v in transform_stats.items() if isinstance(v, (int, float))}
+})
+print(f"Logged {len(transform_stats)} transformation statistics")'''),
+            self.cb.section("Save Transformed Data"),
+            self.cb.code('''storage.write(df, "./experiments/data/silver/customers_transformed")
+mlflow_adapter.end_run()
+print(f"Transformed data saved: {df.shape}")'''),
+        ])
+        return cells
+
+    def generate_databricks_cells(self) -> List[nbformat.NotebookNode]:
+        catalog = self.config.feature_store.catalog
+        schema = self.config.feature_store.schema
+        exp_name = self.config.mlflow.experiment_name
+        numeric_cols = self.get_numeric_columns()
+        categorical_cols = self.get_categorical_columns()
+        return self.header_cells() + [
+            self.cb.section("Setup MLflow Tracking"),
+            self.cb.code(f'''import mlflow
+
+mlflow.set_experiment("/Users/{{spark.conf.get('spark.databricks.notebook.username', 'default')}}/{exp_name}")
+mlflow.start_run(run_name="04_transformation")'''),
+            self.cb.section("Load Silver Data"),
+            self.cb.code(f'''df = spark.table("{catalog}.{schema}.silver_customers")
+input_count = df.count()
+mlflow.log_metric("input_rows", input_count)'''),
+            self.cb.section("Scale Numeric Features"),
+            self.cb.code(f'''from pyspark.ml.feature import StandardScaler, VectorAssembler
+
+numeric_cols = {numeric_cols}
+if numeric_cols:
+    assembler = VectorAssembler(inputCols=numeric_cols, outputCol="numeric_features")
+    df = assembler.transform(df)
+    scaler = StandardScaler(inputCol="numeric_features", outputCol="scaled_features", withStd=True, withMean=True)
+    scaler_model = scaler.fit(df)
+    df = scaler_model.transform(df)
+    mlflow.log_params({{"scaler_type": "standard", "scaled_columns_count": len(numeric_cols)}})
+print("Numeric features scaled")'''),
+            self.cb.section("Encode Categorical Features"),
+            self.cb.code(f'''from pyspark.ml.feature import StringIndexer
+
+categorical_cols = {categorical_cols}
+for col_name in categorical_cols:
+    indexer = StringIndexer(inputCol=col_name, outputCol=f"{{col_name}}_idx")
+    df = indexer.fit(df).transform(df)
+mlflow.log_params({{"encoder_type": "string_indexer", "encoded_columns_count": len(categorical_cols)}})
+print(f"Encoded {{len(categorical_cols)}} categorical columns")'''),
+            self.cb.section("Log Statistics"),
+            self.cb.code('''output_count = df.count()
+mlflow.log_metrics({
+    "output_rows": output_count,
+    "columns_after_transform": len(df.columns),
+})'''),
+            self.cb.section("Save Transformed Data"),
+            self.cb.code(f'''df.write.format("delta").mode("overwrite").saveAsTable("{catalog}.{schema}.silver_transformed")
+mlflow.end_run()
+print("Transformed data saved")'''),
+        ]
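The local transformation path persists its fitted transformers precisely so a scoring pipeline can reapply them, but the scoring side is outside this excerpt. The sketch below therefore assumes a symmetric TransformerManager.load plus a transform method, which may not match the real API in transformer_manager.py:

# Hypothetical scoring-time counterpart; both method names are assumptions.
from customer_retention.stages.preprocessing import TransformerManager

manager = TransformerManager.load("./experiments/data/transformers/transformers.joblib")
scored_input = manager.transform(new_df)  # reuse the transformers fitted during training

The Databricks path, by contrast, fits StandardScaler and StringIndexer inline without persisting the fitted models, so a scoring job there would need to refit them or save them separately.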