churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
customer_retention/analysis/recommendations/cleaning/__init__.py
@@ -0,0 +1,11 @@
+ from .consistency import ConsistencyNormalizeRecommendation
+ from .deduplicate import DeduplicateRecommendation
+ from .impute import ImputeRecommendation
+ from .outlier import OutlierCapRecommendation
+
+ __all__ = [
+     "ImputeRecommendation",
+     "OutlierCapRecommendation",
+     "DeduplicateRecommendation",
+     "ConsistencyNormalizeRecommendation",
+ ]
customer_retention/analysis/recommendations/cleaning/consistency.py
@@ -0,0 +1,107 @@
+ import re
+ from typing import Any, Dict, List, Optional
+
+ import pandas as pd
+
+ from ..base import CleaningRecommendation, RecommendationResult
+
+
+ class ConsistencyNormalizeRecommendation(CleaningRecommendation):
+     def __init__(
+         self, columns: List[str], rationale: str = None, normalization: str = "lowercase",
+         evidence: List[str] = None, priority: str = "medium", source_finding: Optional[Any] = None
+     ):
+         rationale = rationale or f"Normalize values using {normalization}"
+         super().__init__(columns, rationale, evidence, priority, source_finding)
+         self.normalization = normalization
+         self._unique_before: Dict[str, int] = {}
+
+     @property
+     def recommendation_type(self) -> str:
+         return f"normalize_{self.normalization}"
+
+     def _fit_impl(self, df: pd.DataFrame) -> None:
+         variants = {}
+         unique_before = {}
+         for col in self.columns:
+             if col not in df.columns:
+                 continue
+             unique_before[col] = df[col].nunique()
+             if df[col].dtype == object:
+                 variants[col] = df[col].dropna().unique().tolist()[:20]
+         self._fit_params["variants"] = variants
+         self._fit_params["unique_before"] = unique_before
+         self._unique_before = unique_before
+
+     def _normalize_series(self, series: pd.Series) -> pd.Series:
+         if series.dtype != object:
+             return series
+         if self.normalization == "lowercase":
+             return series.str.lower()
+         if self.normalization == "uppercase":
+             return series.str.upper()
+         if self.normalization == "titlecase":
+             return series.str.title()
+         if self.normalization == "strip_whitespace":
+             return series.str.strip()
+         if self.normalization == "collapse_whitespace":
+             return series.apply(lambda x: re.sub(r'\s+', ' ', x) if isinstance(x, str) else x)
+         return series
+
+     def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+         df = df.copy()
+         rows_before = len(df)
+         values_changed = {}
+         unique_after = {}
+         for col in self.columns:
+             if col not in df.columns:
+                 continue
+             original = df[col].copy()
+             df[col] = self._normalize_series(df[col])
+             changed = (original != df[col]) & original.notna()
+             values_changed[col] = int(changed.sum())
+             unique_after[col] = df[col].nunique()
+         return RecommendationResult(
+             data=df, columns_affected=self.columns, rows_before=rows_before,
+             rows_after=len(df), metadata={
+                 "values_changed": values_changed, "unique_after": unique_after,
+                 "unique_before": self._unique_before
+             }
+         )
+
+     def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+         from customer_retention.core.compat import is_spark_available
+         if not is_spark_available():
+             return self._transform_local(df)
+         return self._transform_local(df)
+
+     def _generate_local_code(self) -> str:
+         lines = [f"# Normalize: {self.rationale}"]
+         method_map = {
+             "lowercase": "str.lower()",
+             "uppercase": "str.upper()",
+             "titlecase": "str.title()",
+             "strip_whitespace": "str.strip()",
+             "collapse_whitespace": "apply(lambda x: re.sub(r'\\s+', ' ', x) if isinstance(x, str) else x)",
+         }
+         method = method_map.get(self.normalization, "str.lower()")
+         for col in self.columns:
+             lines.append(f"df['{col}'] = df['{col}'].{method}")
+         return "\n".join(lines)
+
+     def _generate_databricks_code(self) -> str:
+         func_map = {
+             "lowercase": "lower",
+             "uppercase": "upper",
+             "strip_whitespace": "trim",
+             "titlecase": "initcap",
+             "collapse_whitespace": "regexp_replace",
+         }
+         func = func_map.get(self.normalization, "lower")
+         lines = [f"# Normalize: {self.rationale}", f"from pyspark.sql.functions import {func}, col"]
+         for col in self.columns:
+             if self.normalization == "collapse_whitespace":
+                 lines.append(f"df = df.withColumn('{col}', regexp_replace(col('{col}'), r'\\s+', ' '))")
+             else:
+                 lines.append(f"df = df.withColumn('{col}', {func}(col('{col}')))")
+         return "\n".join(lines)
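For context, a minimal usage sketch of ConsistencyNormalizeRecommendation. It assumes the CleaningRecommendation base class in ..base (not shown in this diff) exposes public fit/transform wrappers around the _fit_impl/_transform_local hooks implemented above, and that RecommendationResult exposes its constructor arguments as attributes; those names are assumptions, not confirmed by the diff.

# Sketch under the stated assumptions; fit() and transform() are hypothetical
# wrapper names — only the underscore hooks appear in this diff.
import pandas as pd
from customer_retention.analysis.recommendations.cleaning import ConsistencyNormalizeRecommendation

df = pd.DataFrame({"plan": ["Basic", "basic ", "PRO", "pro"]})
rec = ConsistencyNormalizeRecommendation(columns=["plan"], normalization="lowercase")
rec.fit(df)                    # would record per-column unique counts via _fit_impl
result = rec.transform(df)     # off-Spark this routes to _transform_local
print(result.metadata["values_changed"])   # {'plan': 2} — only 'Basic' and 'PRO' change

The _generate_local_code/_generate_databricks_code pair then emits the equivalent standalone pandas or PySpark one-liners, so the same recommendation can be replayed outside the object.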
customer_retention/analysis/recommendations/cleaning/deduplicate.py
@@ -0,0 +1,94 @@
+ from typing import Any, List, Optional
+
+ import pandas as pd
+
+ from ..base import CleaningRecommendation, RecommendationResult
+
+
+ class DeduplicateRecommendation(CleaningRecommendation):
+     def __init__(
+         self, key_columns: List[str], rationale: str = None, strategy: str = "keep_first",
+         timestamp_column: Optional[str] = None, evidence: List[str] = None,
+         priority: str = "medium", source_finding: Optional[Any] = None
+     ):
+         rationale = rationale or f"Remove duplicate rows using {strategy}"
+         super().__init__(key_columns, rationale, evidence, priority, source_finding)
+         self.key_columns = key_columns
+         self.strategy = strategy
+         self.timestamp_column = timestamp_column
+
+     @property
+     def recommendation_type(self) -> str:
+         return f"deduplicate_{self.strategy}"
+
+     def _fit_impl(self, df: pd.DataFrame) -> None:
+         existing_keys = [k for k in self.key_columns if k in df.columns]
+         if not existing_keys:
+             self._fit_params["duplicate_count"] = 0
+             self._fit_params["duplicate_keys"] = []
+             return
+         duplicated_mask = df.duplicated(subset=existing_keys, keep=False)
+         duplicated_df = df[duplicated_mask]
+         dup_count = len(duplicated_df) - duplicated_df.drop_duplicates(subset=existing_keys).shape[0]
+         self._fit_params["duplicate_count"] = dup_count
+         first_key = existing_keys[0]
+         self._fit_params["duplicate_keys"] = duplicated_df[first_key].unique().tolist()
+
+     def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+         df = df.copy()
+         rows_before = len(df)
+         existing_keys = [k for k in self.key_columns if k in df.columns]
+         if not existing_keys:
+             return RecommendationResult(
+                 data=df, columns_affected=self.key_columns, rows_before=rows_before,
+                 rows_after=rows_before, metadata={"duplicates_removed": 0}
+             )
+         if self.strategy == "keep_first":
+             df = df.drop_duplicates(subset=existing_keys, keep="first")
+         elif self.strategy == "keep_last":
+             df = df.drop_duplicates(subset=existing_keys, keep="last")
+         elif self.strategy == "keep_most_recent" and self.timestamp_column:
+             df = df.sort_values(self.timestamp_column, ascending=False)
+             df = df.drop_duplicates(subset=existing_keys, keep="first")
+             df = df.sort_index()
+         elif self.strategy == "drop_exact":
+             df = df.drop_duplicates(subset=existing_keys, keep="first")
+         rows_after = len(df)
+         return RecommendationResult(
+             data=df, columns_affected=self.key_columns, rows_before=rows_before,
+             rows_after=rows_after, metadata={"duplicates_removed": rows_before - rows_after}
+         )
+
+     def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+         from customer_retention.core.compat import is_spark_available
+         if not is_spark_available():
+             return self._transform_local(df)
+         return self._transform_local(df)
+
+     def _generate_local_code(self) -> str:
+         key_str = ", ".join(f"'{k}'" for k in self.key_columns)
+         lines = [f"# Deduplicate: {self.rationale}"]
+         if self.strategy == "keep_first":
+             lines.append(f"df = df.drop_duplicates(subset=[{key_str}], keep='first')")
+         elif self.strategy == "keep_last":
+             lines.append(f"df = df.drop_duplicates(subset=[{key_str}], keep='last')")
+         elif self.strategy == "keep_most_recent" and self.timestamp_column:
+             lines.append(f"df = df.sort_values('{self.timestamp_column}', ascending=False)")
+             lines.append(f"df = df.drop_duplicates(subset=[{key_str}], keep='first')")
+             lines.append("df = df.sort_index()")
+         elif self.strategy == "drop_exact":
+             lines.append(f"df = df.drop_duplicates(subset=[{key_str}], keep='first')")
+         return "\n".join(lines)
+
+     def _generate_databricks_code(self) -> str:
+         key_str = ", ".join(f"'{k}'" for k in self.key_columns)
+         lines = [f"# Deduplicate: {self.rationale}"]
+         if self.strategy == "keep_most_recent" and self.timestamp_column:
+             lines.append("from pyspark.sql.window import Window")
+             lines.append("from pyspark.sql.functions import row_number, desc")
+             lines.append(f"window = Window.partitionBy([{key_str}]).orderBy(desc('{self.timestamp_column}'))")
+             lines.append("df = df.withColumn('_row_num', row_number().over(window))")
+             lines.append("df = df.filter(df._row_num == 1).drop('_row_num')")
+         else:
+             lines.append(f"df = df.dropDuplicates([{key_str}])")
+         return "\n".join(lines)
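Similarly, a sketch of the keep_most_recent strategy, under the same assumption as above that the base class exposes public fit/transform wrappers: the frame is sorted by timestamp descending, first occurrence per key is kept, and sort_index() restores the original row order.

# Sketch only (not part of the diff); fit()/transform() are assumed wrapper names.
import pandas as pd
from customer_retention.analysis.recommendations.cleaning import DeduplicateRecommendation

df = pd.DataFrame({
    "customer_id": [1, 1, 2],
    "updated_at": pd.to_datetime(["2024-01-01", "2024-03-01", "2024-02-01"]),
})
rec = DeduplicateRecommendation(key_columns=["customer_id"],
                                strategy="keep_most_recent",
                                timestamp_column="updated_at")
rec.fit(df)                  # records duplicate_count=1 and the duplicated key [1]
result = rec.transform(df)   # keeps the 2024-03-01 row for customer 1
print(result.metadata)       # {'duplicates_removed': 1}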
customer_retention/analysis/recommendations/cleaning/impute.py
@@ -0,0 +1,67 @@
+ from typing import Any, Dict, List, Optional
+
+ import pandas as pd
+
+ from ..base import CleaningRecommendation, RecommendationResult
+
+
+ class ImputeRecommendation(CleaningRecommendation):
+     def __init__(
+         self, columns: List[str], rationale: str = None, strategy: str = "median",
+         fill_value: Any = None, evidence: List[str] = None,
+         priority: str = "medium", source_finding: Optional[Any] = None
+     ):
+         rationale = rationale or f"Impute missing values using {strategy}"
+         super().__init__(columns, rationale, evidence, priority, source_finding)
+         self.strategy = strategy
+         self.fill_value = fill_value
+         self._impute_values: Dict[str, Any] = {}
+
+     @property
+     def recommendation_type(self) -> str:
+         return f"impute_{self.strategy}"
+
+     def _fit_impl(self, df: pd.DataFrame) -> None:
+         for col in self.columns:
+             if col not in df.columns:
+                 continue
+             series = df[col]
+             if self.strategy == "median":
+                 self._impute_values[col] = series.median()
+             elif self.strategy == "mean":
+                 self._impute_values[col] = series.mean()
+             elif self.strategy == "mode":
+                 modes = series.mode()
+                 self._impute_values[col] = modes.iloc[0] if len(modes) > 0 else None
+             elif self.strategy == "constant":
+                 self._impute_values[col] = self.fill_value
+         self._fit_params["impute_values"] = self._impute_values
+
+     def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+         df = df.copy()
+         rows_before = len(df)
+         nulls_imputed = {}
+         for col in self.columns:
+             if col in df.columns and col in self._impute_values:
+                 nulls = int(df[col].isna().sum())
+                 df[col] = df[col].fillna(self._impute_values[col])
+                 nulls_imputed[col] = nulls
+         return RecommendationResult(
+             data=df, columns_affected=self.columns, rows_before=rows_before,
+             rows_after=len(df), metadata={"nulls_imputed": nulls_imputed}
+         )
+
+     def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+         from customer_retention.core.compat import is_spark_available
+         if not is_spark_available():
+             return self._transform_local(df)
+         return self._transform_local(df)
+
+     def _generate_local_code(self) -> str:
+         lines = [f"# Impute: {self.rationale}"]
+         for col, val in self._impute_values.items():
+             lines.append(f"df['{col}'] = df['{col}'].fillna({repr(val)})")
+         return "\n".join(lines)
+
+     def _generate_databricks_code(self) -> str:
+         return f"# Impute: {self.rationale}\ndf = df.fillna({self._impute_values})"
customer_retention/analysis/recommendations/cleaning/outlier.py
@@ -0,0 +1,71 @@
+ from typing import Any, Dict, List, Optional
+
+ import pandas as pd
+
+ from ..base import CleaningRecommendation, RecommendationResult
+
+
+ class OutlierCapRecommendation(CleaningRecommendation):
+     def __init__(
+         self, columns: List[str], rationale: str = None, percentile: int = 99,
+         evidence: List[str] = None, priority: str = "medium", source_finding: Optional[Any] = None
+     ):
+         rationale = rationale or f"Cap outliers at {percentile}th percentile"
+         super().__init__(columns, rationale, evidence, priority, source_finding)
+         self.percentile = percentile
+         self._bounds: Dict[str, Dict[str, float]] = {}
+
+     @property
+     def recommendation_type(self) -> str:
+         return f"cap_outliers_{self.percentile}"
+
+     def _fit_impl(self, df: pd.DataFrame) -> None:
+         lower_pct = (100 - self.percentile) / 100
+         upper_pct = self.percentile / 100
+         for col in self.columns:
+             if col not in df.columns:
+                 continue
+             series = df[col].dropna()
+             self._bounds[col] = {
+                 "lower": float(series.quantile(lower_pct)),
+                 "upper": float(series.quantile(upper_pct)),
+             }
+         self._fit_params["bounds"] = self._bounds
+
+     def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+         df = df.copy()
+         rows_before = len(df)
+         outliers_capped = {}
+         for col in self.columns:
+             if col in df.columns and col in self._bounds:
+                 bounds = self._bounds[col]
+                 outlier_mask = (df[col] < bounds["lower"]) | (df[col] > bounds["upper"])
+                 outliers_capped[col] = int(outlier_mask.sum())
+                 df[col] = df[col].clip(lower=bounds["lower"], upper=bounds["upper"])
+         return RecommendationResult(
+             data=df, columns_affected=self.columns, rows_before=rows_before,
+             rows_after=len(df), metadata={"outliers_capped": outliers_capped, "bounds": self._bounds}
+         )
+
+     def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+         from customer_retention.core.compat import is_spark_available
+         if not is_spark_available():
+             return self._transform_local(df)
+         return self._transform_local(df)
+
+     def _generate_local_code(self) -> str:
+         lines = [f"# Cap outliers: {self.rationale}"]
+         for col, bounds in self._bounds.items():
+             lines.append(f"df['{col}'] = df['{col}'].clip(lower={bounds['lower']}, upper={bounds['upper']})")
+         return "\n".join(lines)
+
+     def _generate_databricks_code(self) -> str:
+         lines = [f"# Cap outliers: {self.rationale}", "from pyspark.sql.functions import when, col"]
+         for col, bounds in self._bounds.items():
+             lines.append(
+                 f"df = df.withColumn('{col}', "
+                 f"when(col('{col}') < {bounds['lower']}, {bounds['lower']})"
+                 f".when(col('{col}') > {bounds['upper']}, {bounds['upper']})"
+                 f".otherwise(col('{col}')))"
+             )
+         return "\n".join(lines)
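And a sketch of the capping behavior: with percentile=99, _fit_impl stores the 1st and 99th percentiles per column and _transform_local clips to that band. The bound values in the comments assume pandas' default linear quantile interpolation; fit()/transform() remain assumed wrapper names.

# Sketch only (not part of the diff).
import pandas as pd
from customer_retention.analysis.recommendations.cleaning import OutlierCapRecommendation

df = pd.DataFrame({"spend": list(range(100)) + [10_000]})
rec = OutlierCapRecommendation(columns=["spend"], percentile=99)
rec.fit(df)                  # bounds ≈ {'lower': 1.0, 'upper': 99.0}
result = rec.transform(df)
print(result.metadata["outliers_capped"])   # {'spend': 2} — one low, one high value clipped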
customer_retention/analysis/recommendations/datetime/__init__.py
@@ -0,0 +1,3 @@
+ from .extract import DaysSinceRecommendation, ExtractDayOfWeekRecommendation, ExtractMonthRecommendation
+
+ __all__ = ["ExtractMonthRecommendation", "ExtractDayOfWeekRecommendation", "DaysSinceRecommendation"]
customer_retention/analysis/recommendations/datetime/extract.py
@@ -0,0 +1,149 @@
+ from datetime import datetime
+ from typing import Any, List, Optional
+
+ import pandas as pd
+
+ from ..base import DatetimeRecommendation, RecommendationResult
+
+
+ class ExtractMonthRecommendation(DatetimeRecommendation):
+     def __init__(
+         self, columns: List[str], rationale: str = None, evidence: List[str] = None,
+         priority: str = "medium", source_finding: Optional[Any] = None
+     ):
+         rationale = rationale or "Extract month from datetime for seasonality analysis"
+         super().__init__(columns, rationale, evidence, priority, source_finding)
+
+     @property
+     def recommendation_type(self) -> str:
+         return "extract_month"
+
+     def _fit_impl(self, df: pd.DataFrame) -> None:
+         self._fit_params["columns"] = self.columns
+
+     def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+         df = df.copy()
+         new_cols = []
+         for col in self.columns:
+             if col in df.columns:
+                 new_col = f"{col}_month"
+                 df[new_col] = pd.to_datetime(df[col]).dt.month
+                 new_cols.append(new_col)
+         return RecommendationResult(
+             data=df, columns_affected=self.columns + new_cols,
+             rows_before=len(df), rows_after=len(df), metadata={"new_columns": new_cols}
+         )
+
+     def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+         from customer_retention.core.compat import is_spark_available
+         if not is_spark_available():
+             return self._transform_local(df)
+         return self._transform_local(df)
+
+     def _generate_local_code(self) -> str:
+         lines = [f"# Extract: {self.rationale}"]
+         for col in self.columns:
+             lines.append(f"df['{col}_month'] = pd.to_datetime(df['{col}']).dt.month")
+         return "\n".join(lines)
+
+     def _generate_databricks_code(self) -> str:
+         lines = [f"# Extract: {self.rationale}", "from pyspark.sql.functions import month, col"]
+         for col in self.columns:
+             lines.append(f"df = df.withColumn('{col}_month', month(col('{col}')))")
+         return "\n".join(lines)
+
+
+ class ExtractDayOfWeekRecommendation(DatetimeRecommendation):
+     def __init__(
+         self, columns: List[str], rationale: str = None, evidence: List[str] = None,
+         priority: str = "medium", source_finding: Optional[Any] = None
+     ):
+         rationale = rationale or "Extract day of week from datetime for weekly patterns"
+         super().__init__(columns, rationale, evidence, priority, source_finding)
+
+     @property
+     def recommendation_type(self) -> str:
+         return "extract_dayofweek"
+
+     def _fit_impl(self, df: pd.DataFrame) -> None:
+         self._fit_params["columns"] = self.columns
+
+     def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+         df = df.copy()
+         new_cols = []
+         for col in self.columns:
+             if col in df.columns:
+                 new_col = f"{col}_dayofweek"
+                 df[new_col] = pd.to_datetime(df[col]).dt.dayofweek
+                 new_cols.append(new_col)
+         return RecommendationResult(
+             data=df, columns_affected=self.columns + new_cols,
+             rows_before=len(df), rows_after=len(df), metadata={"new_columns": new_cols}
+         )
+
+     def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+         from customer_retention.core.compat import is_spark_available
+         if not is_spark_available():
+             return self._transform_local(df)
+         return self._transform_local(df)
+
+     def _generate_local_code(self) -> str:
+         lines = [f"# Extract: {self.rationale}"]
+         for col in self.columns:
+             lines.append(f"df['{col}_dayofweek'] = pd.to_datetime(df['{col}']).dt.dayofweek")
+         return "\n".join(lines)
+
+     def _generate_databricks_code(self) -> str:
+         lines = [f"# Extract: {self.rationale}", "from pyspark.sql.functions import dayofweek, col"]
+         for col in self.columns:
+             lines.append(f"df = df.withColumn('{col}_dayofweek', dayofweek(col('{col}')) - 1)")
+         return "\n".join(lines)
+
+
+ class DaysSinceRecommendation(DatetimeRecommendation):
+     def __init__(
+         self, columns: List[str], rationale: str = None, reference_date: datetime = None,
+         evidence: List[str] = None, priority: str = "medium", source_finding: Optional[Any] = None
+     ):
+         rationale = rationale or "Calculate days since datetime for recency features"
+         super().__init__(columns, rationale, evidence, priority, source_finding)
+         self.reference_date = reference_date or datetime.now()
+
+     @property
+     def recommendation_type(self) -> str:
+         return "days_since"
+
+     def _fit_impl(self, df: pd.DataFrame) -> None:
+         self._fit_params["reference_date"] = str(self.reference_date)
+
+     def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+         df = df.copy()
+         new_cols = []
+         for col in self.columns:
+             if col in df.columns:
+                 new_col = f"{col}_days_since"
+                 df[new_col] = (pd.Timestamp(self.reference_date) - pd.to_datetime(df[col])).dt.days
+                 new_cols.append(new_col)
+         return RecommendationResult(
+             data=df, columns_affected=self.columns + new_cols,
+             rows_before=len(df), rows_after=len(df), metadata={"reference_date": str(self.reference_date), "new_columns": new_cols}
+         )
+
+     def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+         from customer_retention.core.compat import is_spark_available
+         if not is_spark_available():
+             return self._transform_local(df)
+         return self._transform_local(df)
+
+     def _generate_local_code(self) -> str:
+         lines = [f"# Extract: {self.rationale}", f"reference_date = pd.Timestamp('{self.reference_date}')"]
+         for col in self.columns:
+             lines.append(f"df['{col}_days_since'] = (reference_date - pd.to_datetime(df['{col}'])).dt.days")
+         return "\n".join(lines)
+
+     def _generate_databricks_code(self) -> str:
+         lines = [f"# Extract: {self.rationale}", "from pyspark.sql.functions import datediff, lit, col, to_date"]
+         lines.append(f"reference_date = '{self.reference_date.strftime('%Y-%m-%d')}'")
+         for col in self.columns:
+             lines.append(f"df = df.withColumn('{col}_days_since', datediff(lit(reference_date), to_date(col('{col}'))))")
+         return "\n".join(lines)
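For context, a short sketch (not part of the diff, with a made-up column name) of what the generated pandas code above produces. One caveat when comparing the two backends: pandas .dt.dayofweek numbers days 0=Monday..6=Sunday, while Spark's dayofweek() returns 1=Sunday..7=Saturday, so the "- 1" in the Databricks variant yields 0=Sunday..6=Saturday rather than the pandas numbering.

# Sketch mirroring the output of _generate_local_code for the three
# recommendations above; 'signup_date' is a hypothetical column.
import pandas as pd

df = pd.DataFrame({"signup_date": pd.to_datetime(["2024-01-15", "2024-02-20"])})
df["signup_date_month"] = pd.to_datetime(df["signup_date"]).dt.month          # 1, 2
df["signup_date_dayofweek"] = pd.to_datetime(df["signup_date"]).dt.dayofweek  # 0 (Mon), 1 (Tue)
reference_date = pd.Timestamp("2024-03-01")
df["signup_date_days_since"] = (reference_date - pd.to_datetime(df["signup_date"])).dt.days  # 46, 10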
customer_retention/analysis/recommendations/encoding/__init__.py
@@ -0,0 +1,3 @@
+ from .categorical import LabelEncodeRecommendation, OneHotEncodeRecommendation
+
+ __all__ = ["OneHotEncodeRecommendation", "LabelEncodeRecommendation"]
customer_retention/analysis/recommendations/encoding/categorical.py
@@ -0,0 +1,114 @@
+ from typing import Any, Dict, List, Optional
+
+ import pandas as pd
+
+ from ..base import EncodingRecommendation, RecommendationResult
+
+
+ class OneHotEncodeRecommendation(EncodingRecommendation):
+     def __init__(
+         self, columns: List[str], rationale: str = None, drop_first: bool = False,
+         evidence: List[str] = None, priority: str = "medium", source_finding: Optional[Any] = None
+     ):
+         rationale = rationale or "One-hot encode categorical features"
+         super().__init__(columns, rationale, evidence, priority, source_finding)
+         self.drop_first = drop_first
+         self._categories: Dict[str, List[str]] = {}
+
+     @property
+     def recommendation_type(self) -> str:
+         return "onehot_encode"
+
+     def _fit_impl(self, df: pd.DataFrame) -> None:
+         for col in self.columns:
+             if col in df.columns:
+                 self._categories[col] = list(df[col].dropna().unique())
+         self._fit_params["categories"] = self._categories
+
+     def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+         df = df.copy()
+         rows_before = len(df)
+         new_cols = []
+         for col in self.columns:
+             if col in df.columns:
+                 dummies = pd.get_dummies(df[col], prefix=col, drop_first=self.drop_first)
+                 new_cols.extend(dummies.columns.tolist())
+                 df = pd.concat([df, dummies], axis=1)
+                 df = df.drop(columns=[col])
+         return RecommendationResult(
+             data=df, columns_affected=self.columns + new_cols, rows_before=rows_before,
+             rows_after=len(df), metadata={"categories": self._categories, "new_columns": new_cols}
+         )
+
+     def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+         from customer_retention.core.compat import is_spark_available
+         if not is_spark_available():
+             return self._transform_local(df)
+         return self._transform_local(df)
+
+     def _generate_local_code(self) -> str:
+         lines = [f"# Encode: {self.rationale}"]
+         for col in self.columns:
+             lines.append(f"df = pd.concat([df, pd.get_dummies(df['{col}'], prefix='{col}')], axis=1).drop(columns=['{col}'])")
+         return "\n".join(lines)
+
+     def _generate_databricks_code(self) -> str:
+         lines = [f"# Encode: {self.rationale}", "from pyspark.ml.feature import StringIndexer, OneHotEncoder"]
+         for col in self.columns:
+             lines.append(f"indexer = StringIndexer(inputCol='{col}', outputCol='{col}_idx')")
+             lines.append(f"encoder = OneHotEncoder(inputCol='{col}_idx', outputCol='{col}_vec')")
+             lines.append("df = encoder.fit(indexer.fit(df).transform(df)).transform(indexer.fit(df).transform(df))")
+         return "\n".join(lines)
+
+
+ class LabelEncodeRecommendation(EncodingRecommendation):
+     def __init__(
+         self, columns: List[str], rationale: str = None, evidence: List[str] = None,
+         priority: str = "medium", source_finding: Optional[Any] = None
+     ):
+         rationale = rationale or "Label encode categorical features to integers"
+         super().__init__(columns, rationale, evidence, priority, source_finding)
+         self._mappings: Dict[str, Dict[str, int]] = {}
+
+     @property
+     def recommendation_type(self) -> str:
+         return "label_encode"
+
+     def _fit_impl(self, df: pd.DataFrame) -> None:
+         for col in self.columns:
+             if col in df.columns:
+                 categories = sorted(df[col].dropna().unique())
+                 self._mappings[col] = {cat: idx for idx, cat in enumerate(categories)}
+         self._fit_params["mappings"] = self._mappings
+
+     def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
+         df = df.copy()
+         rows_before = len(df)
+         for col in self.columns:
+             if col in df.columns and col in self._mappings:
+                 df[col] = df[col].map(self._mappings[col])
+         return RecommendationResult(
+             data=df, columns_affected=self.columns, rows_before=rows_before,
+             rows_after=len(df), metadata={"mappings": self._mappings}
+         )
+
+     def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
+         from customer_retention.core.compat import is_spark_available
+         if not is_spark_available():
+             return self._transform_local(df)
+         return self._transform_local(df)
+
+     def _generate_local_code(self) -> str:
+         lines = [f"# Encode: {self.rationale}", "from sklearn.preprocessing import LabelEncoder"]
+         for col in self.columns:
+             lines.append(f"le_{col} = LabelEncoder()")
+             lines.append(f"df['{col}'] = le_{col}.fit_transform(df['{col}'].astype(str))")
+         return "\n".join(lines)
+
+     def _generate_databricks_code(self) -> str:
+         lines = [f"# Encode: {self.rationale}", "from pyspark.ml.feature import StringIndexer"]
+         for col in self.columns:
+             lines.append(f"indexer = StringIndexer(inputCol='{col}', outputCol='{col}_idx')")
+             lines.append("df = indexer.fit(df).transform(df)")
+             lines.append(f"df = df.drop('{col}').withColumnRenamed('{col}_idx', '{col}')")
+         return "\n".join(lines)
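A minimal sketch of the label-encoding path, under the same assumption as earlier that the EncodingRecommendation base class (not in this diff) exposes public fit/transform wrappers: _fit_impl learns a sorted category-to-integer map, so the mapping is stable across transforms, and unseen categories fall through Series.map to NaN.

# Sketch only; fit()/transform() and result.data are assumed names.
import pandas as pd
from customer_retention.analysis.recommendations.encoding import LabelEncodeRecommendation

train = pd.DataFrame({"tier": ["gold", "silver", "bronze"]})
rec = LabelEncodeRecommendation(columns=["tier"])
rec.fit(train)               # mapping: {'bronze': 0, 'gold': 1, 'silver': 2}
result = rec.transform(pd.DataFrame({"tier": ["silver", "platinum"]}))
print(result.data["tier"].tolist())   # [2.0, nan] — unseen 'platinum' maps to NaN

Note the backends diverge on unseen labels: the pandas path yields NaN, while the generated Databricks variant relies on StringIndexer, which by default errors on labels absent at fit time unless handleInvalid is configured.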