churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from .base import BaseRecommendation, Platform
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from customer_retention.analysis.auto_explorer.findings import ExplorationFindings
|
|
9
|
+
from customer_retention.stages.features.feature_definitions import FeatureCatalog
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class RecommendationPipeline:
    """Ordered collection of recommendations run as a fit/transform pipeline.

    Recommendations are applied in insertion order: the frame returned by one
    recommendation's transform is fed to the next.
    """

    def __init__(self, recommendations: Optional[List[BaseRecommendation]] = None):
        """Create a pipeline.

        Args:
            recommendations: Initial recommendations, applied in list order.
                A falsy value (None or an empty list) is replaced by a fresh
                list so instances never share mutable default state.
        """
        self.recommendations = recommendations or []
        self._is_fitted = False  # set by fit(); surfaced via to_dict()

    def add(self, recommendation: BaseRecommendation) -> "RecommendationPipeline":
        """Append a recommendation and return self for chaining."""
        self.recommendations.append(recommendation)
        return self

    def fit(self, df: pd.DataFrame) -> "RecommendationPipeline":
        """Fit every recommendation on *df* and return self.

        NOTE: each recommendation is fitted against the original frame, not
        against the output of earlier recommendations; chaining of outputs
        happens only in transform().
        """
        for rec in self.recommendations:
            rec.fit(df)
        self._is_fitted = True
        return self

    def transform(
        self, df: pd.DataFrame, platform: Platform = Platform.LOCAL,
        mlflow_adapter: Optional[Any] = None
    ) -> pd.DataFrame:
        """Apply each recommendation in order, feeding outputs forward.

        Args:
            df: Input frame.
            platform: Execution platform forwarded to each recommendation.
            mlflow_adapter: Optional adapter forwarded to each recommendation.

        Returns:
            The frame produced by the last recommendation (or *df* unchanged
            when the pipeline is empty).
        """
        for rec in self.recommendations:
            result = rec.transform(df, platform, mlflow_adapter=mlflow_adapter)
            df = result.data
        return df

    def fit_transform(
        self, df: pd.DataFrame, platform: Platform = Platform.LOCAL,
        mlflow_adapter: Optional[Any] = None
    ) -> pd.DataFrame:
        """Convenience wrapper: fit(df) followed by transform(df)."""
        self.fit(df)
        return self.transform(df, platform, mlflow_adapter=mlflow_adapter)

    def generate_code(self, platform: Platform = Platform.LOCAL) -> str:
        """Return the concatenated generated code of all recommendations.

        Snippets are separated by blank lines; leading/trailing whitespace of
        the combined output is stripped. Empty pipeline yields "".
        """
        if not self.recommendations:
            return ""
        lines = []
        for rec in self.recommendations:
            lines.append(rec.generate_code(platform))
            lines.append("")
        return "\n".join(lines).strip()

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the pipeline; delegates to each recommendation's to_dict()."""
        return {
            "recommendations": [rec.to_dict() for rec in self.recommendations],
            "is_fitted": self._is_fitted,
        }

    def to_feature_catalog(self) -> "FeatureCatalog":
        """Build a FeatureCatalog with one feature definition per recommendation."""
        # Imported locally to avoid a circular import at module load time.
        from customer_retention.stages.features.feature_definitions import FeatureCatalog

        catalog = FeatureCatalog()
        for rec in self.recommendations:
            catalog.add(rec.to_feature_definition())
        return catalog

    @classmethod
    def from_findings(cls, findings: "ExplorationFindings") -> "RecommendationPipeline":
        """Build a pipeline from exploration findings via RecommendationRegistry."""
        from .registry import RecommendationRegistry
        return cls(RecommendationRegistry.from_findings(findings))

    def __len__(self) -> int:
        return len(self.recommendations)

    def __iter__(self) -> Iterator[BaseRecommendation]:
        return iter(self.recommendations)
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import TYPE_CHECKING, List, Optional
|
|
3
|
+
|
|
4
|
+
from .base import BaseRecommendation
|
|
5
|
+
from .cleaning import ImputeRecommendation, OutlierCapRecommendation
|
|
6
|
+
from .datetime import DaysSinceRecommendation, ExtractDayOfWeekRecommendation, ExtractMonthRecommendation
|
|
7
|
+
from .encoding import LabelEncodeRecommendation, OneHotEncodeRecommendation
|
|
8
|
+
from .transform import LogTransformRecommendation, MinMaxScaleRecommendation, StandardScaleRecommendation
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from customer_retention.analysis.auto_explorer.findings import ColumnFinding, ExplorationFindings
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class RecommendationRegistry:
    """Factory that maps recommendation identifier strings (as emitted by the
    exploration findings) to concrete recommendation objects.

    Each ``create_*`` classmethod returns a recommendation instance for a
    recognized identifier, or None when the identifier is not handled by
    that category.
    """

    @classmethod
    def create_cleaning(cls, rec_str: str, columns: List[str], finding: Optional["ColumnFinding"]) -> Optional[BaseRecommendation]:
        """Build a cleaning recommendation (imputation / outlier capping)."""
        impute_strategy = {
            "impute_median": "median",
            "impute_mean": "mean",
            "impute_mode": "mode",
        }.get(rec_str)
        if impute_strategy is not None:
            return ImputeRecommendation(columns, strategy=impute_strategy, source_finding=finding)
        if rec_str == "impute_zero":
            return ImputeRecommendation(columns, strategy="constant", fill_value=0, source_finding=finding)
        capped = re.match(r"cap_outliers_(\d+)", rec_str)
        if capped is not None:
            return OutlierCapRecommendation(columns, percentile=int(capped.group(1)), source_finding=finding)
        return None

    @classmethod
    def create_transform(cls, rec_str: str, columns: List[str], finding: Optional["ColumnFinding"]) -> Optional[BaseRecommendation]:
        """Build a scaling / power-transform recommendation."""
        builder = {
            "standard_scale": StandardScaleRecommendation,
            "minmax_scale": MinMaxScaleRecommendation,
            "log_transform": LogTransformRecommendation,
        }.get(rec_str)
        return builder(columns, source_finding=finding) if builder else None

    @classmethod
    def create_encoding(cls, rec_str: str, columns: List[str], finding: Optional["ColumnFinding"]) -> Optional[BaseRecommendation]:
        """Build a categorical-encoding recommendation."""
        builder = {
            "onehot_encode": OneHotEncodeRecommendation,
            "label_encode": LabelEncodeRecommendation,
        }.get(rec_str)
        return builder(columns, source_finding=finding) if builder else None

    @classmethod
    def create_datetime(cls, rec_str: str, columns: List[str], finding: Optional["ColumnFinding"]) -> Optional[BaseRecommendation]:
        """Build a datetime feature-extraction recommendation."""
        builder = {
            "extract_month": ExtractMonthRecommendation,
            "extract_dayofweek": ExtractDayOfWeekRecommendation,
            "days_since": DaysSinceRecommendation,
        }.get(rec_str)
        return builder(columns, source_finding=finding) if builder else None

    @classmethod
    def from_findings(cls, findings: "ExplorationFindings") -> List[BaseRecommendation]:
        """Translate every per-column recommendation string in *findings* into
        recommendation objects, skipping identifier and target columns.

        Transformation strings are resolved against the transform, encoding
        and datetime factories, in that order; unrecognized strings are
        silently ignored.
        """
        # Imported locally to avoid a circular import at module load time.
        from customer_retention.core.config.column_config import ColumnType

        skip_types = (ColumnType.IDENTIFIER, ColumnType.TARGET)
        built: List[BaseRecommendation] = []
        for col_name, col_finding in findings.columns.items():
            if col_finding.inferred_type in skip_types:
                continue
            for rec_str in getattr(col_finding, "cleaning_recommendations", []) or []:
                candidate = cls.create_cleaning(rec_str, [col_name], col_finding)
                if candidate:
                    built.append(candidate)
            for rec_str in getattr(col_finding, "transformation_recommendations", []) or []:
                candidate = (
                    cls.create_transform(rec_str, [col_name], col_finding)
                    or cls.create_encoding(rec_str, [col_name], col_finding)
                    or cls.create_datetime(rec_str, [col_name], col_finding)
                )
                if candidate:
                    built.append(candidate)
        return built
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from typing import Any, List, Optional
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from ..base import BaseRecommendation, RecommendationResult
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DropColumnRecommendation(BaseRecommendation):
    """Recommendation that removes one or more columns from the frame.

    The fit step records which of the requested columns actually exist in the
    fitted frame; transform drops only those (re-checking existence, so a
    frame missing some of them does not raise).
    """

    def __init__(
        self, columns: List[str], rationale: Optional[str] = None, reason: str = "not_specified",
        evidence: Optional[List[str]] = None, priority: str = "medium", source_finding: Optional[Any] = None
    ):
        """Create a drop-column recommendation.

        Args:
            columns: Columns to drop.
            rationale: Human-readable justification; defaults to a generated one.
            reason: Short machine tag, embedded in recommendation_type.
            evidence: Supporting evidence strings, forwarded to the base class.
            priority: Priority label, forwarded to the base class.
            source_finding: Originating exploration finding, if any.
        """
        rationale = rationale or f"Drop columns: {', '.join(columns)}"
        super().__init__(columns, rationale, evidence, priority, source_finding)
        self.reason = reason
        # Populated by _fit_impl: requested columns present in the fitted frame.
        self._columns_to_drop: List[str] = []

    @property
    def category(self) -> str:
        return "feature_selection"

    @property
    def recommendation_type(self) -> str:
        # e.g. "drop_constant", "drop_high_missing", or "drop_not_specified".
        return f"drop_{self.reason}"

    def _fit_impl(self, df: pd.DataFrame) -> None:
        """Record which requested columns exist in *df*."""
        self._columns_to_drop = [c for c in self.columns if c in df.columns]

    def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
        """Drop the fitted columns from a copy of *df* (row count unchanged)."""
        df = df.copy()
        rows_before = len(df)
        # Re-check existence: the transform frame may differ from the fit frame.
        cols_to_drop = [c for c in self._columns_to_drop if c in df.columns]
        if cols_to_drop:
            df = df.drop(columns=cols_to_drop)
        return RecommendationResult(
            data=df, columns_affected=self._columns_to_drop, rows_before=rows_before,
            rows_after=len(df), metadata={"dropped_columns": cols_to_drop}
        )

    def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
        # A Spark-native implementation is not provided: both the
        # spark-available and spark-unavailable paths previously returned the
        # pandas result, so delegate unconditionally.
        return self._transform_local(df)

    def _generate_local_code(self) -> str:
        """Generate pandas code that drops the fitted columns."""
        cols_str = ", ".join(f"'{c}'" for c in self._columns_to_drop)
        lines = [f"# Drop columns: {self.rationale}"]
        lines.append(f"df = df.drop(columns=[{cols_str}])")
        return "\n".join(lines)

    def _generate_databricks_code(self) -> str:
        """Generate PySpark code that drops the fitted columns."""
        cols_str = ", ".join(f"'{c}'" for c in self._columns_to_drop)
        lines = [f"# Drop columns: {self.rationale}"]
        lines.append(f"df = df.drop([{cols_str}])")
        return "\n".join(lines)
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
from .power import LogTransformRecommendation, SqrtTransformRecommendation
|
|
2
|
+
from .scale import MinMaxScaleRecommendation, StandardScaleRecommendation
|
|
3
|
+
|
|
4
|
+
__all__ = ["StandardScaleRecommendation", "MinMaxScaleRecommendation", "LogTransformRecommendation", "SqrtTransformRecommendation"]
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from typing import Any, List, Optional
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
from ..base import RecommendationResult, TransformRecommendation
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class LogTransformRecommendation(TransformRecommendation):
    """Recommendation that applies ``log1p`` to skewed numeric columns.

    NOTE(review): ``np.log1p`` yields NaN (with a runtime warning) for values
    below -1; callers presumably apply this only to non-negative columns —
    confirm upstream.
    """

    def __init__(
        self, columns: List[str], rationale: Optional[str] = None, evidence: Optional[List[str]] = None,
        priority: str = "medium", source_finding: Optional[Any] = None
    ):
        """Create a log-transform recommendation.

        Args:
            columns: Columns to transform in place.
            rationale: Human-readable justification; defaults to a generic one.
            evidence: Supporting evidence strings, forwarded to the base class.
            priority: Priority label, forwarded to the base class.
            source_finding: Originating exploration finding, if any.
        """
        rationale = rationale or "Apply log1p transform to reduce skewness"
        super().__init__(columns, rationale, evidence, priority, source_finding)

    @property
    def recommendation_type(self) -> str:
        return "log_transform"

    def _fit_impl(self, df: pd.DataFrame) -> None:
        # log1p is stateless; only the target column list is recorded.
        self._fit_params["columns"] = self.columns

    def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
        """Apply log1p in place on a copy of *df*; missing columns are skipped."""
        df = df.copy()
        for col in self.columns:
            if col in df.columns:
                df[col] = np.log1p(df[col])
        return RecommendationResult(
            data=df, columns_affected=self.columns, rows_before=len(df),
            rows_after=len(df), metadata={"transform": "log1p"}
        )

    def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
        # A Spark-native implementation is not provided: both the
        # spark-available and spark-unavailable paths previously returned the
        # pandas result, so delegate unconditionally.
        return self._transform_local(df)

    def _generate_local_code(self) -> str:
        """Generate pandas/numpy code applying log1p to each column."""
        lines = [f"# Transform: {self.rationale}", "import numpy as np"]
        for col in self.columns:
            lines.append(f"df['{col}'] = np.log1p(df['{col}'])")
        return "\n".join(lines)

    def _generate_databricks_code(self) -> str:
        """Generate PySpark code applying log1p to each column."""
        lines = [f"# Transform: {self.rationale}", "from pyspark.sql.functions import log1p, col"]
        for col in self.columns:
            lines.append(f"df = df.withColumn('{col}', log1p(col('{col}')))")
        return "\n".join(lines)
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class SqrtTransformRecommendation(TransformRecommendation):
    """Recommend a square-root transform for moderately skewed numeric columns.

    NOTE(review): ``np.sqrt`` yields NaN for negative inputs — assumes the
    target columns are non-negative; confirm with upstream profiling.
    """

    def __init__(
        self, columns: List[str], rationale: Optional[str] = None,
        evidence: Optional[List[str]] = None,
        priority: str = "medium", source_finding: Optional[Any] = None
    ):
        """Create the recommendation.

        Args:
            columns: Columns the transform should be applied to.
            rationale: Human-readable justification; defaulted when omitted.
            evidence: Supporting findings, if any.
            priority: Relative importance of applying this recommendation.
            source_finding: Originating analysis finding, if any.
        """
        # Fixed annotations: these parameters default to None, so they are
        # Optional, not bare str/List.
        rationale = rationale or "Apply sqrt transform to reduce moderate skewness"
        super().__init__(columns, rationale, evidence, priority, source_finding)

    @property
    def recommendation_type(self) -> str:
        """Stable string identifier for this recommendation type."""
        return "sqrt_transform"

    def _fit_impl(self, df: pd.DataFrame) -> None:
        # Stateless transform: nothing is learned from the data; only the
        # target column list is recorded for reproducibility.
        self._fit_params["columns"] = self.columns

    def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
        """Apply sqrt to each configured column present in ``df`` (pandas path)."""
        df = df.copy()  # never mutate the caller's frame
        for col in self.columns:
            if col in df.columns:
                df[col] = np.sqrt(df[col])
        return RecommendationResult(
            data=df, columns_affected=self.columns, rows_before=len(df),
            rows_after=len(df), metadata={"transform": "sqrt"}
        )

    def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
        # Fix: the previous is_spark_available() check was dead code — both
        # branches returned the local result.  Delegate unconditionally until
        # a true Spark implementation exists.
        return self._transform_local(df)

    def _generate_local_code(self) -> str:
        """Render standalone pandas/numpy code reproducing this transform."""
        lines = [f"# Transform: {self.rationale}", "import numpy as np"]
        for col in self.columns:
            lines.append(f"df['{col}'] = np.sqrt(df['{col}'])")
        return "\n".join(lines)

    def _generate_databricks_code(self) -> str:
        """Render standalone PySpark code reproducing this transform."""
        lines = [f"# Transform: {self.rationale}", "from pyspark.sql.functions import sqrt, col"]
        for col in self.columns:
            lines.append(f"df = df.withColumn('{col}', sqrt(col('{col}')))")
        return "\n".join(lines)
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from ..base import RecommendationResult, TransformRecommendation
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class StandardScaleRecommendation(TransformRecommendation):
    """Standardize columns to zero mean and unit variance.

    Per-column statistics are captured at fit time so the identical scaling
    can be replayed on new data (train/serve consistency).
    """

    def __init__(
        self, columns: List[str], rationale: Optional[str] = None,
        evidence: Optional[List[str]] = None,
        priority: str = "medium", source_finding: Optional[Any] = None
    ):
        """Create the recommendation.

        Args:
            columns: Columns to standardize.
            rationale: Human-readable justification; defaulted when omitted.
            evidence: Supporting findings, if any.
            priority: Relative importance of applying this recommendation.
            source_finding: Originating analysis finding, if any.
        """
        # Fixed annotations: these parameters default to None, so they are
        # Optional, not bare str/List.
        rationale = rationale or "Standardize features to zero mean and unit variance"
        super().__init__(columns, rationale, evidence, priority, source_finding)
        self._means: Dict[str, float] = {}  # per-column mean captured at fit
        self._stds: Dict[str, float] = {}   # per-column population std (ddof=0)

    @property
    def recommendation_type(self) -> str:
        """Stable string identifier for this recommendation type."""
        return "standard_scale"

    def _fit_impl(self, df: pd.DataFrame) -> None:
        """Capture per-column mean/std for later replay; missing columns are skipped."""
        for col in self.columns:
            if col in df.columns:
                self._means[col] = float(df[col].mean())
                # ddof=0 gives the population std, matching sklearn's StandardScaler.
                self._stds[col] = float(df[col].std(ddof=0))
        self._fit_params = {"means": self._means, "stds": self._stds}

    def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
        """Apply (x - mean) / std using the fitted statistics (pandas path)."""
        df = df.copy()  # never mutate the caller's frame
        for col in self.columns:
            if col in df.columns and col in self._means:
                # Constant columns (std == 0) are only mean-shifted, never divided.
                std = self._stds[col] if self._stds[col] != 0 else 1.0
                df[col] = (df[col] - self._means[col]) / std
        return RecommendationResult(
            data=df, columns_affected=self.columns, rows_before=len(df),
            rows_after=len(df), metadata={"means": self._means, "stds": self._stds}
        )

    def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
        # Fix: the previous is_spark_available() check was dead code — both
        # branches returned the local result.  Delegate unconditionally until
        # a true Spark implementation exists.
        return self._transform_local(df)

    def _generate_local_code(self) -> str:
        """Render standalone sklearn code reproducing this scaling."""
        return f"""# Scale: {self.rationale}
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[{self.columns}] = scaler.fit_transform(df[{self.columns}])"""

    def _generate_databricks_code(self) -> str:
        """Render standalone PySpark code reproducing this scaling."""
        # Fix: assemble once and reuse — the previous template called
        # assembler.transform(df) twice, repeating the assembly work.
        return f"""# Scale: {self.rationale}
from pyspark.ml.feature import StandardScaler, VectorAssembler
assembler = VectorAssembler(inputCols={self.columns}, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaled", withStd=True, withMean=True)
assembled = assembler.transform(df)
df = scaler.fit(assembled).transform(assembled)"""
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class MinMaxScaleRecommendation(TransformRecommendation):
    """Linearly rescale columns into ``feature_range`` (default ``(0, 1)``).

    Per-column min/max are captured at fit time so the identical mapping can
    be replayed on new data (train/serve consistency).
    """

    def __init__(
        self, columns: List[str], rationale: Optional[str] = None,
        feature_range: Tuple[float, float] = (0, 1),
        evidence: Optional[List[str]] = None, priority: str = "medium",
        source_finding: Optional[Any] = None
    ):
        """Create the recommendation.

        Args:
            columns: Columns to rescale.
            rationale: Human-readable justification; defaulted when omitted.
            feature_range: Target (min, max) of the rescaled output.
            evidence: Supporting findings, if any.
            priority: Relative importance of applying this recommendation.
            source_finding: Originating analysis finding, if any.
        """
        # Fixed annotations: these parameters default to None, so they are
        # Optional, not bare str/List.
        rationale = rationale or f"Scale features to range {feature_range}"
        super().__init__(columns, rationale, evidence, priority, source_finding)
        self.feature_range = feature_range
        self._mins: Dict[str, float] = {}  # per-column min captured at fit
        self._maxs: Dict[str, float] = {}  # per-column max captured at fit

    @property
    def recommendation_type(self) -> str:
        """Stable string identifier for this recommendation type."""
        return "minmax_scale"

    def _fit_impl(self, df: pd.DataFrame) -> None:
        """Capture per-column min/max for later replay; missing columns are skipped."""
        for col in self.columns:
            if col in df.columns:
                self._mins[col] = float(df[col].min())
                self._maxs[col] = float(df[col].max())
        self._fit_params = {"mins": self._mins, "maxs": self._maxs, "feature_range": self.feature_range}

    def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
        """Linearly map each fitted column onto ``feature_range`` (pandas path)."""
        df = df.copy()  # never mutate the caller's frame
        min_val, max_val = self.feature_range
        for col in self.columns:
            if col in df.columns and col in self._mins:
                col_min, col_max = self._mins[col], self._maxs[col]
                # Constant columns (max == min) keep scale 1.0 to avoid a
                # division by zero; their values collapse to min_val.
                scale = (max_val - min_val) / (col_max - col_min) if col_max != col_min else 1.0
                df[col] = (df[col] - col_min) * scale + min_val
        return RecommendationResult(
            data=df, columns_affected=self.columns, rows_before=len(df),
            rows_after=len(df), metadata={"mins": self._mins, "maxs": self._maxs}
        )

    def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
        # Fix: the previous is_spark_available() check was dead code — both
        # branches returned the local result.  Delegate unconditionally until
        # a true Spark implementation exists.
        return self._transform_local(df)

    def _generate_local_code(self) -> str:
        """Render standalone sklearn code reproducing this scaling."""
        return f"""# Scale: {self.rationale}
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range={self.feature_range})
df[{self.columns}] = scaler.fit_transform(df[{self.columns}])"""

    def _generate_databricks_code(self) -> str:
        """Render standalone PySpark code reproducing this scaling."""
        # Fix: assemble once and reuse — the previous template called
        # assembler.transform(df) twice, repeating the assembly work.
        return f"""# Scale: {self.rationale}
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
assembler = VectorAssembler(inputCols={self.columns}, outputCol="features")
scaler = MinMaxScaler(inputCol="features", outputCol="scaled", min={self.feature_range[0]}, max={self.feature_range[1]})
assembled = assembler.transform(df)
df = scaler.fit(assembled).transform(assembled)"""
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Display/visualization utilities: re-exported here so callers can import
# them from the package root instead of the individual modules.
from . import console
from .chart_builder import ChartBuilder
from .display import DisplayManager, detect_environment, display_figure, display_summary, display_table
from .number_formatter import NumberFormatter

# Explicit public API of this subpackage.
__all__ = [
    "ChartBuilder",
    "DisplayManager",
    "NumberFormatter",
    "detect_environment",
    "display_figure",
    "display_summary",
    "display_table",
    "console",
]
|