churnkit-0.75.0a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0

customer_retention/analysis/recommendations/cleaning/__init__.py
@@ -0,0 +1,11 @@
+from .consistency import ConsistencyNormalizeRecommendation
+from .deduplicate import DeduplicateRecommendation
+from .impute import ImputeRecommendation
+from .outlier import OutlierCapRecommendation
+
+__all__ = [
+    "ImputeRecommendation",
+    "OutlierCapRecommendation",
+    "DeduplicateRecommendation",
+    "ConsistencyNormalizeRecommendation",
+]

customer_retention/analysis/recommendations/cleaning/consistency.py
@@ -0,0 +1,107 @@
+import re
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+
+from ..base import CleaningRecommendation, RecommendationResult
+
+
+class ConsistencyNormalizeRecommendation(CleaningRecommendation):
+    def __init__(
+        self, columns: List[str], rationale: str = None, normalization: str = "lowercase",
+        evidence: List[str] = None, priority: str = "medium", source_finding: Optional[Any] = None
+    ):
+        rationale = rationale or f"Normalize values using {normalization}"
+        super().__init__(columns, rationale, evidence, priority, source_finding)
+        self.normalization = normalization
+        self._unique_before: Dict[str, int] = {}
+
+    @property
+    def recommendation_type(self) -> str:
+        return f"normalize_{self.normalization}"
+
+    def _fit_impl(self, df: pd.DataFrame) -> None:
+        variants = {}
+        unique_before = {}
+        for col in self.columns:
+            if col not in df.columns:
+                continue
+            unique_before[col] = df[col].nunique()
+            if df[col].dtype == object:
+                variants[col] = df[col].dropna().unique().tolist()[:20]
+        self._fit_params["variants"] = variants
+        self._fit_params["unique_before"] = unique_before
+        self._unique_before = unique_before
+
+    def _normalize_series(self, series: pd.Series) -> pd.Series:
+        if series.dtype != object:
+            return series
+        if self.normalization == "lowercase":
+            return series.str.lower()
+        if self.normalization == "uppercase":
+            return series.str.upper()
+        if self.normalization == "titlecase":
+            return series.str.title()
+        if self.normalization == "strip_whitespace":
+            return series.str.strip()
+        if self.normalization == "collapse_whitespace":
+            return series.apply(lambda x: re.sub(r'\s+', ' ', x) if isinstance(x, str) else x)
+        return series
+
+    def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+        df = df.copy()
+        rows_before = len(df)
+        values_changed = {}
+        unique_after = {}
+        for col in self.columns:
+            if col not in df.columns:
+                continue
+            original = df[col].copy()
+            df[col] = self._normalize_series(df[col])
+            changed = (original != df[col]) & original.notna()
+            values_changed[col] = int(changed.sum())
+            unique_after[col] = df[col].nunique()
+        return RecommendationResult(
+            data=df, columns_affected=self.columns, rows_before=rows_before,
+            rows_after=len(df), metadata={
+                "values_changed": values_changed, "unique_after": unique_after,
+                "unique_before": self._unique_before
+            }
+        )
+
+    def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+        from customer_retention.core.compat import is_spark_available
+        if not is_spark_available():
+            return self._transform_local(df)
+        return self._transform_local(df)
+
+    def _generate_local_code(self) -> str:
+        lines = [f"# Normalize: {self.rationale}"]
+        method_map = {
+            "lowercase": "str.lower()",
+            "uppercase": "str.upper()",
+            "titlecase": "str.title()",
+            "strip_whitespace": "str.strip()",
+            "collapse_whitespace": "apply(lambda x: re.sub(r'\\s+', ' ', x) if isinstance(x, str) else x)",
+        }
+        method = method_map.get(self.normalization, "str.lower()")
+        for col in self.columns:
+            lines.append(f"df['{col}'] = df['{col}'].{method}")
+        return "\n".join(lines)
+
+    def _generate_databricks_code(self) -> str:
+        func_map = {
+            "lowercase": "lower",
+            "uppercase": "upper",
+            "strip_whitespace": "trim",
+            "titlecase": "initcap",
+            "collapse_whitespace": "regexp_replace",
+        }
+        func = func_map.get(self.normalization, "lower")
+        lines = [f"# Normalize: {self.rationale}", f"from pyspark.sql.functions import {func}, col"]
+        for col in self.columns:
+            if self.normalization == "collapse_whitespace":
+                lines.append(f"df = df.withColumn('{col}', regexp_replace(col('{col}'), r'\\s+', ' '))")
+            else:
+                lines.append(f"df = df.withColumn('{col}', {func}(col('{col}')))")
+        return "\n".join(lines)
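
For the default `lowercase` strategy, the generated pandas snippet reduces to a per-column `str.lower()`, and the fitted `unique_before`/`unique_after` counts quantify how many variants collapse. A minimal sketch of that effect (the DataFrame and column name here are hypothetical, not part of the package):

```python
# Sketch of the lowercase normalization the generated snippet performs.
import pandas as pd

df = pd.DataFrame({"plan": ["Basic", "basic", "BASIC", "Pro", None]})
print(df["plan"].nunique())          # 4 distinct raw values; NaN is ignored

df["plan"] = df["plan"].str.lower()  # what normalize_lowercase emits
print(df["plan"].nunique())          # 2 after normalization: "basic", "pro"
```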

customer_retention/analysis/recommendations/cleaning/deduplicate.py
@@ -0,0 +1,94 @@
+from typing import Any, List, Optional
+
+import pandas as pd
+
+from ..base import CleaningRecommendation, RecommendationResult
+
+
+class DeduplicateRecommendation(CleaningRecommendation):
+    def __init__(
+        self, key_columns: List[str], rationale: str = None, strategy: str = "keep_first",
+        timestamp_column: Optional[str] = None, evidence: List[str] = None,
+        priority: str = "medium", source_finding: Optional[Any] = None
+    ):
+        rationale = rationale or f"Remove duplicate rows using {strategy}"
+        super().__init__(key_columns, rationale, evidence, priority, source_finding)
+        self.key_columns = key_columns
+        self.strategy = strategy
+        self.timestamp_column = timestamp_column
+
+    @property
+    def recommendation_type(self) -> str:
+        return f"deduplicate_{self.strategy}"
+
+    def _fit_impl(self, df: pd.DataFrame) -> None:
+        existing_keys = [k for k in self.key_columns if k in df.columns]
+        if not existing_keys:
+            self._fit_params["duplicate_count"] = 0
+            self._fit_params["duplicate_keys"] = []
+            return
+        duplicated_mask = df.duplicated(subset=existing_keys, keep=False)
+        duplicated_df = df[duplicated_mask]
+        dup_count = len(duplicated_df) - duplicated_df.drop_duplicates(subset=existing_keys).shape[0]
+        self._fit_params["duplicate_count"] = dup_count
+        first_key = existing_keys[0]
+        self._fit_params["duplicate_keys"] = duplicated_df[first_key].unique().tolist()
+
+    def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+        df = df.copy()
+        rows_before = len(df)
+        existing_keys = [k for k in self.key_columns if k in df.columns]
+        if not existing_keys:
+            return RecommendationResult(
+                data=df, columns_affected=self.key_columns, rows_before=rows_before,
+                rows_after=rows_before, metadata={"duplicates_removed": 0}
+            )
+        if self.strategy == "keep_first":
+            df = df.drop_duplicates(subset=existing_keys, keep="first")
+        elif self.strategy == "keep_last":
+            df = df.drop_duplicates(subset=existing_keys, keep="last")
+        elif self.strategy == "keep_most_recent" and self.timestamp_column:
+            df = df.sort_values(self.timestamp_column, ascending=False)
+            df = df.drop_duplicates(subset=existing_keys, keep="first")
+            df = df.sort_index()
+        elif self.strategy == "drop_exact":
+            df = df.drop_duplicates(subset=existing_keys, keep="first")
+        rows_after = len(df)
+        return RecommendationResult(
+            data=df, columns_affected=self.key_columns, rows_before=rows_before,
+            rows_after=rows_after, metadata={"duplicates_removed": rows_before - rows_after}
+        )
+
+    def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+        from customer_retention.core.compat import is_spark_available
+        if not is_spark_available():
+            return self._transform_local(df)
+        return self._transform_local(df)
+
+    def _generate_local_code(self) -> str:
+        key_str = ", ".join(f"'{k}'" for k in self.key_columns)
+        lines = [f"# Deduplicate: {self.rationale}"]
+        if self.strategy == "keep_first":
+            lines.append(f"df = df.drop_duplicates(subset=[{key_str}], keep='first')")
+        elif self.strategy == "keep_last":
+            lines.append(f"df = df.drop_duplicates(subset=[{key_str}], keep='last')")
+        elif self.strategy == "keep_most_recent" and self.timestamp_column:
+            lines.append(f"df = df.sort_values('{self.timestamp_column}', ascending=False)")
+            lines.append(f"df = df.drop_duplicates(subset=[{key_str}], keep='first')")
+            lines.append("df = df.sort_index()")
+        elif self.strategy == "drop_exact":
+            lines.append(f"df = df.drop_duplicates(subset=[{key_str}], keep='first')")
+        return "\n".join(lines)
+
+    def _generate_databricks_code(self) -> str:
+        key_str = ", ".join(f"'{k}'" for k in self.key_columns)
+        lines = [f"# Deduplicate: {self.rationale}"]
+        if self.strategy == "keep_most_recent" and self.timestamp_column:
+            lines.append("from pyspark.sql.window import Window")
+            lines.append("from pyspark.sql.functions import row_number, desc")
+            lines.append(f"window = Window.partitionBy([{key_str}]).orderBy(desc('{self.timestamp_column}'))")
+            lines.append("df = df.withColumn('_row_num', row_number().over(window))")
+            lines.append("df = df.filter(df._row_num == 1).drop('_row_num')")
+        else:
+            lines.append(f"df = df.dropDuplicates([{key_str}])")
+        return "\n".join(lines)
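
For the `keep_most_recent` strategy, both `_transform_local` and the generated snippet use the same sort-then-dedupe idiom, with `sort_index()` restoring the original row order afterwards. A self-contained sketch with hypothetical data:

```python
# Sketch of the keep_most_recent dedupe idiom used above.
import pandas as pd

df = pd.DataFrame({
    "customer_id": [1, 1, 2],
    "updated_at": pd.to_datetime(["2024-01-01", "2024-03-01", "2024-02-01"]),
})
df = df.sort_values("updated_at", ascending=False)            # newest rows first
df = df.drop_duplicates(subset=["customer_id"], keep="first")  # keep newest per key
df = df.sort_index()                                           # restore input order
print(df)  # customer 1 keeps its 2024-03-01 row
```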

customer_retention/analysis/recommendations/cleaning/impute.py
@@ -0,0 +1,67 @@
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+
+from ..base import CleaningRecommendation, RecommendationResult
+
+
+class ImputeRecommendation(CleaningRecommendation):
+    def __init__(
+        self, columns: List[str], rationale: str = None, strategy: str = "median",
+        fill_value: Any = None, evidence: List[str] = None,
+        priority: str = "medium", source_finding: Optional[Any] = None
+    ):
+        rationale = rationale or f"Impute missing values using {strategy}"
+        super().__init__(columns, rationale, evidence, priority, source_finding)
+        self.strategy = strategy
+        self.fill_value = fill_value
+        self._impute_values: Dict[str, Any] = {}
+
+    @property
+    def recommendation_type(self) -> str:
+        return f"impute_{self.strategy}"
+
+    def _fit_impl(self, df: pd.DataFrame) -> None:
+        for col in self.columns:
+            if col not in df.columns:
+                continue
+            series = df[col]
+            if self.strategy == "median":
+                self._impute_values[col] = series.median()
+            elif self.strategy == "mean":
+                self._impute_values[col] = series.mean()
+            elif self.strategy == "mode":
+                modes = series.mode()
+                self._impute_values[col] = modes.iloc[0] if len(modes) > 0 else None
+            elif self.strategy == "constant":
+                self._impute_values[col] = self.fill_value
+        self._fit_params["impute_values"] = self._impute_values
+
+    def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+        df = df.copy()
+        rows_before = len(df)
+        nulls_imputed = {}
+        for col in self.columns:
+            if col in df.columns and col in self._impute_values:
+                nulls = int(df[col].isna().sum())
+                df[col] = df[col].fillna(self._impute_values[col])
+                nulls_imputed[col] = nulls
+        return RecommendationResult(
+            data=df, columns_affected=self.columns, rows_before=rows_before,
+            rows_after=len(df), metadata={"nulls_imputed": nulls_imputed}
+        )
+
+    def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+        from customer_retention.core.compat import is_spark_available
+        if not is_spark_available():
+            return self._transform_local(df)
+        return self._transform_local(df)
+
+    def _generate_local_code(self) -> str:
+        lines = [f"# Impute: {self.rationale}"]
+        for col, val in self._impute_values.items():
+            lines.append(f"df['{col}'] = df['{col}'].fillna({repr(val)})")
+        return "\n".join(lines)
+
+    def _generate_databricks_code(self) -> str:
+        return f"# Impute: {self.rationale}\ndf = df.fillna({self._impute_values})"
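
The class separates fitting (computing per-column statistics in `_fit_impl`) from application (`fillna` in `_transform_local`), so statistics learned on one frame can be reapplied to new data. A minimal sketch of that split for the default `median` strategy, with hypothetical frames:

```python
# Sketch of the fit/apply split the class encodes:
# the statistic comes from the fitted frame, then is reused via fillna.
import pandas as pd

train = pd.DataFrame({"tenure": [1.0, 3.0, None, 7.0]})
score = pd.DataFrame({"tenure": [None, 2.0]})

impute_values = {"tenure": train["tenure"].median()}  # fitted once: 3.0
score["tenure"] = score["tenure"].fillna(impute_values["tenure"])
print(score["tenure"].tolist())  # [3.0, 2.0]
```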

customer_retention/analysis/recommendations/cleaning/outlier.py
@@ -0,0 +1,71 @@
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+
+from ..base import CleaningRecommendation, RecommendationResult
+
+
+class OutlierCapRecommendation(CleaningRecommendation):
+    def __init__(
+        self, columns: List[str], rationale: str = None, percentile: int = 99,
+        evidence: List[str] = None, priority: str = "medium", source_finding: Optional[Any] = None
+    ):
+        rationale = rationale or f"Cap outliers at {percentile}th percentile"
+        super().__init__(columns, rationale, evidence, priority, source_finding)
+        self.percentile = percentile
+        self._bounds: Dict[str, Dict[str, float]] = {}
+
+    @property
+    def recommendation_type(self) -> str:
+        return f"cap_outliers_{self.percentile}"
+
+    def _fit_impl(self, df: pd.DataFrame) -> None:
+        lower_pct = (100 - self.percentile) / 100
+        upper_pct = self.percentile / 100
+        for col in self.columns:
+            if col not in df.columns:
+                continue
+            series = df[col].dropna()
+            self._bounds[col] = {
+                "lower": float(series.quantile(lower_pct)),
+                "upper": float(series.quantile(upper_pct)),
+            }
+        self._fit_params["bounds"] = self._bounds
+
+    def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+        df = df.copy()
+        rows_before = len(df)
+        outliers_capped = {}
+        for col in self.columns:
+            if col in df.columns and col in self._bounds:
+                bounds = self._bounds[col]
+                outlier_mask = (df[col] < bounds["lower"]) | (df[col] > bounds["upper"])
+                outliers_capped[col] = int(outlier_mask.sum())
+                df[col] = df[col].clip(lower=bounds["lower"], upper=bounds["upper"])
+        return RecommendationResult(
+            data=df, columns_affected=self.columns, rows_before=rows_before,
+            rows_after=len(df), metadata={"outliers_capped": outliers_capped, "bounds": self._bounds}
+        )
+
+    def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+        from customer_retention.core.compat import is_spark_available
+        if not is_spark_available():
+            return self._transform_local(df)
+        return self._transform_local(df)
+
+    def _generate_local_code(self) -> str:
+        lines = [f"# Cap outliers: {self.rationale}"]
+        for col, bounds in self._bounds.items():
+            lines.append(f"df['{col}'] = df['{col}'].clip(lower={bounds['lower']}, upper={bounds['upper']})")
+        return "\n".join(lines)
+
+    def _generate_databricks_code(self) -> str:
+        lines = [f"# Cap outliers: {self.rationale}", "from pyspark.sql.functions import when, col"]
+        for col, bounds in self._bounds.items():
+            lines.append(
+                f"df = df.withColumn('{col}', "
+                f"when(col('{col}') < {bounds['lower']}, {bounds['lower']})"
+                f".when(col('{col}') > {bounds['upper']}, {bounds['upper']})"
+                f".otherwise(col('{col}')))"
+            )
+        return "\n".join(lines)
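
Bounds are fitted once in `_fit_impl` from the `(100 - percentile) / 100` and `percentile / 100` quantiles, then reapplied with `clip`, so the same caps carry over to unseen data. A minimal sketch with hypothetical data:

```python
# Sketch of symmetric percentile capping as fitted and applied by the class.
import pandas as pd

s = pd.Series(range(1, 101))               # 1..100
lower = float(s.quantile(0.01))            # (100 - 99) / 100 -> 1st percentile
upper = float(s.quantile(0.99))            # 99 / 100 -> 99th percentile
capped = s.clip(lower=lower, upper=upper)  # values outside [lower, upper] are capped
print(lower, upper, capped.min(), capped.max())
```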

customer_retention/analysis/recommendations/datetime/extract.py
@@ -0,0 +1,149 @@
+from datetime import datetime
+from typing import Any, List, Optional
+
+import pandas as pd
+
+from ..base import DatetimeRecommendation, RecommendationResult
+
+
+class ExtractMonthRecommendation(DatetimeRecommendation):
+    def __init__(
+        self, columns: List[str], rationale: str = None, evidence: List[str] = None,
+        priority: str = "medium", source_finding: Optional[Any] = None
+    ):
+        rationale = rationale or "Extract month from datetime for seasonality analysis"
+        super().__init__(columns, rationale, evidence, priority, source_finding)
+
+    @property
+    def recommendation_type(self) -> str:
+        return "extract_month"
+
+    def _fit_impl(self, df: pd.DataFrame) -> None:
+        self._fit_params["columns"] = self.columns
+
+    def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+        df = df.copy()
+        new_cols = []
+        for col in self.columns:
+            if col in df.columns:
+                new_col = f"{col}_month"
+                df[new_col] = pd.to_datetime(df[col]).dt.month
+                new_cols.append(new_col)
+        return RecommendationResult(
+            data=df, columns_affected=self.columns + new_cols,
+            rows_before=len(df), rows_after=len(df), metadata={"new_columns": new_cols}
+        )
+
+    def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+        from customer_retention.core.compat import is_spark_available
+        if not is_spark_available():
+            return self._transform_local(df)
+        return self._transform_local(df)
+
+    def _generate_local_code(self) -> str:
+        lines = [f"# Extract: {self.rationale}"]
+        for col in self.columns:
+            lines.append(f"df['{col}_month'] = pd.to_datetime(df['{col}']).dt.month")
+        return "\n".join(lines)
+
+    def _generate_databricks_code(self) -> str:
+        lines = [f"# Extract: {self.rationale}", "from pyspark.sql.functions import month, col"]
+        for col in self.columns:
+            lines.append(f"df = df.withColumn('{col}_month', month(col('{col}')))")
+        return "\n".join(lines)
+
+
+class ExtractDayOfWeekRecommendation(DatetimeRecommendation):
+    def __init__(
+        self, columns: List[str], rationale: str = None, evidence: List[str] = None,
+        priority: str = "medium", source_finding: Optional[Any] = None
+    ):
+        rationale = rationale or "Extract day of week from datetime for weekly patterns"
+        super().__init__(columns, rationale, evidence, priority, source_finding)
+
+    @property
+    def recommendation_type(self) -> str:
+        return "extract_dayofweek"
+
+    def _fit_impl(self, df: pd.DataFrame) -> None:
+        self._fit_params["columns"] = self.columns
+
+    def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+        df = df.copy()
+        new_cols = []
+        for col in self.columns:
+            if col in df.columns:
+                new_col = f"{col}_dayofweek"
+                df[new_col] = pd.to_datetime(df[col]).dt.dayofweek
+                new_cols.append(new_col)
+        return RecommendationResult(
+            data=df, columns_affected=self.columns + new_cols,
+            rows_before=len(df), rows_after=len(df), metadata={"new_columns": new_cols}
+        )
+
+    def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+        from customer_retention.core.compat import is_spark_available
+        if not is_spark_available():
+            return self._transform_local(df)
+        return self._transform_local(df)
+
+    def _generate_local_code(self) -> str:
+        lines = [f"# Extract: {self.rationale}"]
+        for col in self.columns:
+            lines.append(f"df['{col}_dayofweek'] = pd.to_datetime(df['{col}']).dt.dayofweek")
+        return "\n".join(lines)
+
+    def _generate_databricks_code(self) -> str:
+        lines = [f"# Extract: {self.rationale}", "from pyspark.sql.functions import dayofweek, col"]
+        for col in self.columns:
+            lines.append(f"df = df.withColumn('{col}_dayofweek', dayofweek(col('{col}')) - 1)")
+        return "\n".join(lines)
+
+
+class DaysSinceRecommendation(DatetimeRecommendation):
+    def __init__(
+        self, columns: List[str], rationale: str = None, reference_date: datetime = None,
+        evidence: List[str] = None, priority: str = "medium", source_finding: Optional[Any] = None
+    ):
+        rationale = rationale or "Calculate days since datetime for recency features"
+        super().__init__(columns, rationale, evidence, priority, source_finding)
+        self.reference_date = reference_date or datetime.now()
+
+    @property
+    def recommendation_type(self) -> str:
+        return "days_since"
+
+    def _fit_impl(self, df: pd.DataFrame) -> None:
+        self._fit_params["reference_date"] = str(self.reference_date)
+
+    def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+        df = df.copy()
+        new_cols = []
+        for col in self.columns:
+            if col in df.columns:
+                new_col = f"{col}_days_since"
+                df[new_col] = (pd.Timestamp(self.reference_date) - pd.to_datetime(df[col])).dt.days
+                new_cols.append(new_col)
+        return RecommendationResult(
+            data=df, columns_affected=self.columns + new_cols,
+            rows_before=len(df), rows_after=len(df), metadata={"reference_date": str(self.reference_date), "new_columns": new_cols}
+        )
+
+    def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+        from customer_retention.core.compat import is_spark_available
+        if not is_spark_available():
+            return self._transform_local(df)
+        return self._transform_local(df)
+
+    def _generate_local_code(self) -> str:
+        lines = [f"# Extract: {self.rationale}", f"reference_date = pd.Timestamp('{self.reference_date}')"]
+        for col in self.columns:
+            lines.append(f"df['{col}_days_since'] = (reference_date - pd.to_datetime(df['{col}'])).dt.days")
+        return "\n".join(lines)
+
+    def _generate_databricks_code(self) -> str:
+        lines = [f"# Extract: {self.rationale}", "from pyspark.sql.functions import datediff, lit, col, to_date"]
+        lines.append(f"reference_date = '{self.reference_date.strftime('%Y-%m-%d')}'")
+        for col in self.columns:
+            lines.append(f"df = df.withColumn('{col}_days_since', datediff(lit(reference_date), to_date(col('{col}'))))")
+        return "\n".join(lines)
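
The two backends mirror each other, with one visible divergence: pandas `dt.dayofweek` numbers Monday as 0 through Sunday as 6, while Spark's `dayofweek` numbers Sunday as 1 through Saturday as 7, so the emitted `dayofweek(col(...)) - 1` produces a 0-6 range anchored at Sunday rather than at Monday. A minimal sketch of the pandas-side extractions (the DataFrame, column, and reference date are hypothetical):

```python
# Sketch of the three pandas extractions the local generators emit.
import pandas as pd

df = pd.DataFrame({"signup": ["2024-01-15", "2024-06-03"]})
ts = pd.to_datetime(df["signup"])
reference_date = pd.Timestamp("2024-07-01")

df["signup_month"] = ts.dt.month              # 1..12
df["signup_dayofweek"] = ts.dt.dayofweek      # Monday=0 .. Sunday=6
df["signup_days_since"] = (reference_date - ts).dt.days
print(df)
```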

customer_retention/analysis/recommendations/encoding/categorical.py
@@ -0,0 +1,114 @@
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+
+from ..base import EncodingRecommendation, RecommendationResult
+
+
+class OneHotEncodeRecommendation(EncodingRecommendation):
+    def __init__(
+        self, columns: List[str], rationale: str = None, drop_first: bool = False,
+        evidence: List[str] = None, priority: str = "medium", source_finding: Optional[Any] = None
+    ):
+        rationale = rationale or "One-hot encode categorical features"
+        super().__init__(columns, rationale, evidence, priority, source_finding)
+        self.drop_first = drop_first
+        self._categories: Dict[str, List[str]] = {}
+
+    @property
+    def recommendation_type(self) -> str:
+        return "onehot_encode"
+
+    def _fit_impl(self, df: pd.DataFrame) -> None:
+        for col in self.columns:
+            if col in df.columns:
+                self._categories[col] = list(df[col].dropna().unique())
+        self._fit_params["categories"] = self._categories
+
+    def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+        df = df.copy()
+        rows_before = len(df)
+        new_cols = []
+        for col in self.columns:
+            if col in df.columns:
+                dummies = pd.get_dummies(df[col], prefix=col, drop_first=self.drop_first)
+                new_cols.extend(dummies.columns.tolist())
+                df = pd.concat([df, dummies], axis=1)
+                df = df.drop(columns=[col])
+        return RecommendationResult(
+            data=df, columns_affected=self.columns + new_cols, rows_before=rows_before,
+            rows_after=len(df), metadata={"categories": self._categories, "new_columns": new_cols}
+        )
+
+    def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+        from customer_retention.core.compat import is_spark_available
+        if not is_spark_available():
+            return self._transform_local(df)
+        return self._transform_local(df)
+
+    def _generate_local_code(self) -> str:
+        lines = [f"# Encode: {self.rationale}"]
+        for col in self.columns:
+            lines.append(f"df = pd.concat([df, pd.get_dummies(df['{col}'], prefix='{col}')], axis=1).drop(columns=['{col}'])")
+        return "\n".join(lines)
+
+    def _generate_databricks_code(self) -> str:
+        lines = [f"# Encode: {self.rationale}", "from pyspark.ml.feature import StringIndexer, OneHotEncoder"]
+        for col in self.columns:
+            lines.append(f"indexer = StringIndexer(inputCol='{col}', outputCol='{col}_idx')")
+            lines.append(f"encoder = OneHotEncoder(inputCol='{col}_idx', outputCol='{col}_vec')")
+            lines.append("df = encoder.fit(indexer.fit(df).transform(df)).transform(indexer.fit(df).transform(df))")
+        return "\n".join(lines)
+
+
+class LabelEncodeRecommendation(EncodingRecommendation):
+    def __init__(
+        self, columns: List[str], rationale: str = None, evidence: List[str] = None,
+        priority: str = "medium", source_finding: Optional[Any] = None
+    ):
+        rationale = rationale or "Label encode categorical features to integers"
+        super().__init__(columns, rationale, evidence, priority, source_finding)
+        self._mappings: Dict[str, Dict[str, int]] = {}
+
+    @property
+    def recommendation_type(self) -> str:
+        return "label_encode"
+
+    def _fit_impl(self, df: pd.DataFrame) -> None:
+        for col in self.columns:
+            if col in df.columns:
+                categories = sorted(df[col].dropna().unique())
+                self._mappings[col] = {cat: idx for idx, cat in enumerate(categories)}
+        self._fit_params["mappings"] = self._mappings
+
+    def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+        df = df.copy()
+        rows_before = len(df)
+        for col in self.columns:
+            if col in df.columns and col in self._mappings:
+                df[col] = df[col].map(self._mappings[col])
+        return RecommendationResult(
+            data=df, columns_affected=self.columns, rows_before=rows_before,
+            rows_after=len(df), metadata={"mappings": self._mappings}
+        )
+
+    def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+        from customer_retention.core.compat import is_spark_available
+        if not is_spark_available():
+            return self._transform_local(df)
+        return self._transform_local(df)
+
+    def _generate_local_code(self) -> str:
+        lines = [f"# Encode: {self.rationale}", "from sklearn.preprocessing import LabelEncoder"]
+        for col in self.columns:
+            lines.append(f"le_{col} = LabelEncoder()")
+            lines.append(f"df['{col}'] = le_{col}.fit_transform(df['{col}'].astype(str))")
+        return "\n".join(lines)
+
+    def _generate_databricks_code(self) -> str:
+        lines = [f"# Encode: {self.rationale}", "from pyspark.ml.feature import StringIndexer"]
+        for col in self.columns:
+            lines.append(f"indexer = StringIndexer(inputCol='{col}', outputCol='{col}_idx')")
+            lines.append("df = indexer.fit(df).transform(df)")
+            lines.append(f"df = df.drop('{col}').withColumnRenamed('{col}_idx', '{col}')")
+        return "\n".join(lines)
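
Note that `_transform_local` derives its dummy columns from whatever frame it receives at transform time; the categories recorded by `_fit_impl` land in metadata but are not enforced, so score-time data with unseen or missing categories can produce a different column set than at fit time. For comparison, a common pandas idiom (not taken from this package) that pins the dummy columns to fit-time categories:

```python
# Standard pandas idiom for keeping one-hot columns fixed to fit-time categories.
# Data and names are hypothetical.
import pandas as pd

fit_categories = ["basic", "pro"]                    # categories seen at fit time
score = pd.DataFrame({"plan": ["pro", "enterprise"]})

plan = pd.Categorical(score["plan"], categories=fit_categories)
dummies = pd.get_dummies(plan, prefix="plan")        # always plan_basic, plan_pro
print(dummies.columns.tolist())
print(dummies.to_numpy())                            # unseen "enterprise" row encodes as all zeros
```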