churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,47 @@
1
+ """Lean runtime module for DataFrame transformations.
2
+
3
+ This module is safe to import from generated pipelines — it depends only on
4
+ ``core.compat``, ``numpy``, ``sklearn``, ``joblib``, and the lightweight
5
+ ``models`` dataclasses. It does **not** import from ``analysis/``,
6
+ ``generators/`` (except ``models``), ``visualization/``, or ``stages/``.
7
+ """
8
+
9
+ from .artifact_store import ArtifactStore
10
+ from .executor import TransformExecutor
11
+ from .ops import (
12
+ apply_cap_outlier,
13
+ apply_cap_then_log,
14
+ apply_derived_composite,
15
+ apply_derived_interaction,
16
+ apply_derived_ratio,
17
+ apply_drop_column,
18
+ apply_feature_select,
19
+ apply_impute_null,
20
+ apply_log_transform,
21
+ apply_one_hot_encode,
22
+ apply_segment_aware_cap,
23
+ apply_sqrt_transform,
24
+ apply_type_cast,
25
+ apply_winsorize,
26
+ apply_zero_inflation_handling,
27
+ )
28
+
29
+ __all__ = [
30
+ "TransformExecutor",
31
+ "ArtifactStore",
32
+ "apply_impute_null",
33
+ "apply_cap_outlier",
34
+ "apply_type_cast",
35
+ "apply_drop_column",
36
+ "apply_winsorize",
37
+ "apply_segment_aware_cap",
38
+ "apply_log_transform",
39
+ "apply_sqrt_transform",
40
+ "apply_zero_inflation_handling",
41
+ "apply_cap_then_log",
42
+ "apply_one_hot_encode",
43
+ "apply_feature_select",
44
+ "apply_derived_ratio",
45
+ "apply_derived_interaction",
46
+ "apply_derived_composite",
47
+ ]
@@ -0,0 +1,50 @@
1
+ """Persistence for fitted transformers.
2
+
3
+ Drop-in replacement for the ``FitArtifactRegistry`` class that was
4
+ previously inlined in ``gold.py.j2`` and ``run_scoring.py.j2`` templates.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+
11
+ import joblib
12
+ import yaml
13
+
14
+
15
class ArtifactStore:
    """Persist fitted transformers (scalers, encoders, etc.) on disk.

    Every artifact is written with ``joblib`` to
    ``<artifacts_dir>/<artifact_id>.joblib`` where ``artifact_id`` is
    ``"{target_column}_{artifact_type}"``. An in-memory manifest maps each id
    to its type, column and file path; :meth:`save_manifest` writes that
    manifest to ``manifest.yaml`` inside the same directory and
    :meth:`from_manifest` rebuilds a store from such a file.
    """

    def __init__(self, artifacts_dir: str | Path):
        """Create the store, creating ``artifacts_dir`` when it is missing."""
        self._dir = Path(artifacts_dir)
        self._dir.mkdir(parents=True, exist_ok=True)
        self._manifest: dict = {}

    def register(self, artifact_type: str, target_column: str, transformer) -> None:
        """Dump ``transformer`` to disk and record it in the manifest.

        Registering the same type/column pair again overwrites both the file
        and the manifest entry.
        """
        artifact_id = f"{target_column}_{artifact_type}"
        path = self._dir / f"{artifact_id}.joblib"
        joblib.dump(transformer, path)
        self._manifest[artifact_id] = {
            "type": artifact_type,
            "column": target_column,
            "path": str(path),
        }

    def save_manifest(self) -> None:
        """Write the manifest to ``manifest.yaml`` in the artifacts directory."""
        # Explicit encoding keeps the file identical across platforms.
        with open(self._dir / "manifest.yaml", "w", encoding="utf-8") as f:
            yaml.dump(self._manifest, f)

    def _resolve_path(self, artifact_id: str) -> str:
        """Return the file path to load for an id present in the manifest.

        The recorded path is used whenever it still exists. Otherwise the
        file name that :meth:`register` always writes is looked up inside this
        store's directory, so a manifest keeps working after the artifacts
        directory was copied or moved, or when the recorded path was relative
        to a different working directory. When neither location exists the
        recorded path is returned so the caller sees the original error.
        """
        recorded = self._manifest[artifact_id]["path"]
        if Path(recorded).exists():
            return recorded
        relocated = self._dir / f"{artifact_id}.joblib"
        return str(relocated) if relocated.exists() else recorded

    def load(self, artifact_id: str):
        """Return the transformer stored under ``artifact_id``.

        Raises:
            KeyError: if ``artifact_id`` is not in the manifest.
        """
        if artifact_id not in self._manifest:
            raise KeyError(f"Artifact {artifact_id} not found")
        # NOTE: joblib.load unpickles the file; only load artifacts that come
        # from a trusted source.
        return joblib.load(self._resolve_path(artifact_id))

    def has(self, artifact_id: str) -> bool:
        """Return True when ``artifact_id`` is recorded in the manifest."""
        return artifact_id in self._manifest

    @classmethod
    def from_manifest(cls, manifest_path: str | Path) -> ArtifactStore:
        """Build a store rooted at the manifest's directory and load its entries.

        An empty manifest file yields a store with no artifacts.
        """
        store = cls(str(Path(manifest_path).parent))
        with open(manifest_path, encoding="utf-8") as f:
            store._manifest = yaml.safe_load(f) or {}
        return store
@@ -0,0 +1,157 @@
1
+ """TransformExecutor — single dispatch table for all transformation types.
2
+
3
+ Maps :class:`TransformationStep` types to the appropriate function in
4
+ :mod:`ops` (stateless) or class in :mod:`fitted` (stateful).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from customer_retention.core.compat import DataFrame
10
+ from customer_retention.generators.pipeline_generator.models import (
11
+ PipelineTransformationType,
12
+ TransformationStep,
13
+ )
14
+
15
+ from . import ops
16
+ from .artifact_store import ArtifactStore
17
+ from .fitted import FittedEncoder, FittedPowerTransform, FittedScaler
18
+
19
+
20
class TransformExecutor:
    """Run :class:`TransformationStep` objects against a DataFrame.

    The step's ``type`` is looked up in ``_DISPATCH`` and the matching
    ``_handle_*`` method forwards the step's column and parameters to a
    function in :mod:`ops` or to a fitted wrapper from :mod:`fitted`.
    Works with both pandas and pyspark.pandas DataFrames via the compat
    layer; used by generated pipelines and exploration code alike.
    """

    def apply(
        self,
        df: DataFrame,
        step: TransformationStep,
        *,
        fit_mode: bool = False,
        artifact_store: ArtifactStore | None = None,
    ) -> DataFrame:
        """Apply a single step and return the resulting DataFrame.

        Raises:
            ValueError: if ``step.type`` has no entry in ``_DISPATCH``.
        """
        handler = self._DISPATCH.get(step.type)
        if handler is not None:
            # Table entries are plain functions, so ``self`` is passed explicitly.
            return handler(self, df, step, fit_mode=fit_mode, artifact_store=artifact_store)
        raise ValueError(f"Unknown transformation type: {step.type}")

    def apply_all(
        self,
        df: DataFrame,
        steps: list[TransformationStep],
        *,
        fit_mode: bool = False,
        artifact_store: ArtifactStore | None = None,
    ) -> DataFrame:
        """Apply ``steps`` in order, feeding each result into the next step."""
        current = df
        for one_step in steps:
            current = self.apply(
                current, one_step, fit_mode=fit_mode, artifact_store=artifact_store
            )
        return current

    def _apply_fitted(self, fitted, df, step, *, fit_mode=False, artifact_store=None):
        """Call ``fit_transform`` in fit mode, otherwise ``transform``."""
        run = fitted.fit_transform if fit_mode else fitted.transform
        return run(df, step.column, artifact_store)

    # --- stateless handlers: forward to ``ops`` with per-step defaults ---

    def _handle_impute_null(self, df, step, **_ignored):
        fill_value = step.parameters.get("value", 0)
        return ops.apply_impute_null(df, step.column, value=fill_value)

    def _handle_cap_outlier(self, df, step, **_ignored):
        params = step.parameters
        low = params.get("lower", 0)
        high = params.get("upper", 1_000_000)
        return ops.apply_cap_outlier(df, step.column, lower=low, upper=high)

    def _handle_type_cast(self, df, step, **_ignored):
        target_dtype = step.parameters.get("dtype", "float")
        return ops.apply_type_cast(df, step.column, dtype=target_dtype)

    def _handle_drop_column(self, df, step, **_ignored):
        return ops.apply_drop_column(df, step.column)

    def _handle_winsorize(self, df, step, **_ignored):
        params = step.parameters
        low = params.get("lower_bound", 0)
        high = params.get("upper_bound", 1_000_000)
        return ops.apply_winsorize(df, step.column, lower_bound=low, upper_bound=high)

    def _handle_segment_aware_cap(self, df, step, **_ignored):
        segment_count = step.parameters.get("n_segments", 2)
        return ops.apply_segment_aware_cap(df, step.column, n_segments=segment_count)

    def _handle_log_transform(self, df, step, **_ignored):
        return ops.apply_log_transform(df, step.column)

    def _handle_sqrt_transform(self, df, step, **_ignored):
        return ops.apply_sqrt_transform(df, step.column)

    def _handle_zero_inflation(self, df, step, **_ignored):
        return ops.apply_zero_inflation_handling(df, step.column)

    def _handle_cap_then_log(self, df, step, **_ignored):
        return ops.apply_cap_then_log(df, step.column)

    # --- stateful handlers: delegate to the fitted wrappers ---

    def _handle_encode(self, df, step, *, fit_mode=False, artifact_store=None, **_ignored):
        # One-hot encoding goes through ``ops``; every other method uses FittedEncoder.
        if step.parameters.get("method", "one_hot") == "one_hot":
            return ops.apply_one_hot_encode(df, step.column)
        encoder = FittedEncoder()
        return self._apply_fitted(
            encoder, df, step, fit_mode=fit_mode, artifact_store=artifact_store
        )

    def _handle_scale(self, df, step, *, fit_mode=False, artifact_store=None, **_ignored):
        scaler = FittedScaler(step.parameters.get("method", "standard"))
        return self._apply_fitted(
            scaler, df, step, fit_mode=fit_mode, artifact_store=artifact_store
        )

    def _handle_yeo_johnson(self, df, step, *, fit_mode=False, artifact_store=None, **_ignored):
        power = FittedPowerTransform()
        return self._apply_fitted(
            power, df, step, fit_mode=fit_mode, artifact_store=artifact_store
        )

    def _handle_feature_select(self, df, step, **_ignored):
        return ops.apply_feature_select(df, step.column)

    def _handle_derived_column(self, df, step, **_ignored):
        params = step.parameters
        # "method" takes precedence; "action" is consulted only when it is missing or falsy.
        kind = params.get("method") or params.get("action")
        if kind == "ratio":
            return ops.apply_derived_ratio(
                df,
                step.column,
                numerator=params.get("numerator", ""),
                denominator=params.get("denominator", ""),
            )
        elif kind == "interaction":
            return ops.apply_derived_interaction(
                df,
                step.column,
                col_a=params.get("col_a", ""),
                col_b=params.get("col_b", ""),
            )
        elif kind == "composite":
            source_columns = params.get("columns", [])
            return ops.apply_derived_composite(df, step.column, columns=source_columns)
        # Any other kind leaves the DataFrame untouched.
        return df

    # Step type -> handler. Values are the undecorated functions defined above.
    _DISPATCH = {
        PipelineTransformationType.IMPUTE_NULL: _handle_impute_null,
        PipelineTransformationType.CAP_OUTLIER: _handle_cap_outlier,
        PipelineTransformationType.TYPE_CAST: _handle_type_cast,
        PipelineTransformationType.DROP_COLUMN: _handle_drop_column,
        PipelineTransformationType.WINSORIZE: _handle_winsorize,
        PipelineTransformationType.SEGMENT_AWARE_CAP: _handle_segment_aware_cap,
        PipelineTransformationType.LOG_TRANSFORM: _handle_log_transform,
        PipelineTransformationType.SQRT_TRANSFORM: _handle_sqrt_transform,
        PipelineTransformationType.YEO_JOHNSON: _handle_yeo_johnson,
        PipelineTransformationType.ZERO_INFLATION_HANDLING: _handle_zero_inflation,
        PipelineTransformationType.CAP_THEN_LOG: _handle_cap_then_log,
        PipelineTransformationType.ENCODE: _handle_encode,
        PipelineTransformationType.SCALE: _handle_scale,
        PipelineTransformationType.FEATURE_SELECT: _handle_feature_select,
        PipelineTransformationType.DERIVED_COLUMN: _handle_derived_column,
    }
@@ -0,0 +1,92 @@
1
+ """Stateful fit/transform classes.
2
+
3
+ These wrap sklearn transformers and integrate with :class:`ArtifactStore`
4
+ for persistence. All classes use ``core.compat.to_pandas`` to convert
5
+ DataFrames to numpy-backed pandas before calling sklearn, ensuring the
6
+ same source code works on both pandas and pyspark.pandas.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from sklearn.preprocessing import (
12
+ LabelEncoder,
13
+ MinMaxScaler,
14
+ PowerTransformer,
15
+ StandardScaler,
16
+ )
17
+
18
+ from customer_retention.core.compat import DataFrame, ensure_pandas_series, to_pandas
19
+
20
+
21
class FittedScaler:
    """Wraps :class:`StandardScaler` / :class:`MinMaxScaler`.

    ``method="standard"`` selects :class:`StandardScaler`; every other value
    selects :class:`MinMaxScaler`.
    """

    def __init__(self, method: str = "standard"):
        self.method = method
        if method == "standard":
            self._scaler = StandardScaler()
        else:
            self._scaler = MinMaxScaler()

    def fit_transform(self, df: DataFrame, column: str, artifact_store) -> DataFrame:
        """Fit on ``column``, overwrite it with the scaled values, register the scaler.

        ``df`` is returned unchanged when ``column`` is absent; otherwise it
        is modified in place and returned.
        """
        if column in df.columns:
            frame = to_pandas(df[[column]])
            scaled = self._scaler.fit_transform(frame)
            # sklearn returns an (n, 1) array; flatten it back to one column.
            df[column] = scaled.ravel()
            artifact_store.register("scaler", column, self._scaler)
        return df

    def transform(self, df: DataFrame, column: str, artifact_store) -> DataFrame:
        """Scale ``column`` with the scaler loaded as ``"<column>_scaler"``.

        ``df`` is returned unchanged when ``column`` is absent.
        """
        if column in df.columns:
            stored = artifact_store.load(f"{column}_scaler")
            frame = to_pandas(df[[column]])
            df[column] = stored.transform(frame).ravel()
        return df
44
+
45
+
46
class FittedEncoder:
    """Wraps :class:`LabelEncoder` with unknown-class fallback.

    Values are cast to ``str`` before encoding, both when fitting and when
    transforming.  At transform time, labels that were not seen during
    fitting are encoded as ``0`` (the same code as the first fitted class).
    """

    def __init__(self):
        self._encoder = LabelEncoder()

    def fit_transform(self, df: DataFrame, column: str, artifact_store) -> DataFrame:
        """Fit on ``column``, overwrite it with integer codes, register the encoder.

        ``df`` is returned unchanged when ``column`` is absent; otherwise it
        is modified in place and returned.
        """
        if column not in df.columns:
            return df
        series = ensure_pandas_series(df[column].astype(str))
        df[column] = self._encoder.fit_transform(series)
        artifact_store.register("encoder", column, self._encoder)
        return df

    def transform(self, df: DataFrame, column: str, artifact_store) -> DataFrame:
        """Encode ``column`` with the encoder loaded as ``"<column>_encoder"``.

        Unseen labels fall back to code ``0``.  ``df`` is returned unchanged
        when ``column`` is absent.
        """
        if column not in df.columns:
            return df
        encoder = artifact_store.load(f"{column}_encoder")
        series = ensure_pandas_series(df[column].astype(str))
        # LabelEncoder.transform returns each label's position in ``classes_``,
        # so one dict lookup per row yields the same codes without calling
        # sklearn (and scanning ``classes_``) once per value.
        code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
        df[column] = series.map(code_by_label).fillna(0).astype("int64")
        return df
69
+
70
+
71
class FittedPowerTransform:
    """Wraps Yeo-Johnson :class:`PowerTransformer`.

    Null values are replaced by ``0`` before the column is handed to sklearn,
    both when fitting and when transforming.
    """

    def __init__(self):
        self._pt = PowerTransformer(method="yeo-johnson")

    def fit_transform(self, df: DataFrame, column: str, artifact_store) -> DataFrame:
        """Fit on ``column``, overwrite it with transformed values, register the transformer.

        ``df`` is returned unchanged when ``column`` is absent; otherwise it
        is modified in place and returned.
        """
        if column in df.columns:
            filled = df[[column]].fillna(0)
            transformed = self._pt.fit_transform(to_pandas(filled))
            # sklearn returns an (n, 1) array; flatten it back to one column.
            df[column] = transformed.ravel()
            artifact_store.register("power_transformer", column, self._pt)
        return df

    def transform(self, df: DataFrame, column: str, artifact_store) -> DataFrame:
        """Transform ``column`` with the object loaded as ``"<column>_power_transformer"``.

        ``df`` is returned unchanged when ``column`` is absent.
        """
        if column in df.columns:
            stored = artifact_store.load(f"{column}_power_transformer")
            filled = df[[column]].fillna(0)
            df[column] = stored.transform(to_pandas(filled)).ravel()
        return df
@@ -0,0 +1,148 @@
1
+ """Stateless transformation functions.
2
+
3
+ Each function takes (df, column, **params) and returns df.
4
+ Uses core.compat for platform-agnostic DataFrame operations
5
+ (works on both pandas and pyspark.pandas).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import functools
11
+ from typing import Any
12
+
13
+ import numpy as np
14
+
15
+ from customer_retention.core.compat import DataFrame, pd
16
+
17
+
18
def _requires_column(fn):
    """Decorate ``fn(df, column, ...)`` so it is skipped when ``column`` is absent.

    The wrapped function returns ``df`` itself, without calling ``fn``, when
    ``column`` is not in ``df.columns``; otherwise it forwards every argument.
    """

    @functools.wraps(fn)
    def guarded(df: DataFrame, column: str, *args, **kwargs) -> DataFrame:
        has_column = column in df.columns
        return fn(df, column, *args, **kwargs) if has_column else df

    return guarded
25
+
26
+
27
@_requires_column
def apply_impute_null(df: DataFrame, column: str, *, value: Any = 0) -> DataFrame:
    """Fill nulls in ``column`` in place and return ``df``.

    ``value="median"`` fills with the column's median; any other ``value`` is
    used as the fill value directly.
    """
    fill_with = df[column].median() if value == "median" else value
    df[column] = df[column].fillna(fill_with)
    return df
34
+
35
+
36
@_requires_column
def apply_cap_outlier(
    df: DataFrame, column: str, *, lower: float = 0, upper: float = 1_000_000
) -> DataFrame:
    """Clip ``column`` to ``[lower, upper]`` in place and return ``df``."""
    capped = df[column].clip(lower=lower, upper=upper)
    df[column] = capped
    return df
42
+
43
+
44
def apply_type_cast(df: DataFrame, column: str, *, dtype: str = "float") -> DataFrame:
    """Cast ``column`` to ``dtype`` in place and return ``df``.

    ``df`` is returned unchanged when ``column`` is absent.
    """
    if column in df.columns:
        df[column] = df[column].astype(dtype)
    return df
49
+
50
+
51
def apply_drop_column(df: DataFrame, column: str) -> DataFrame:
    """Return ``df`` without ``column``; a missing column is silently ignored."""
    to_remove = [column]
    return df.drop(columns=to_remove, errors="ignore")
53
+
54
+
55
@_requires_column
def apply_winsorize(
    df: DataFrame, column: str, *, lower_bound: float = 0, upper_bound: float = 1_000_000
) -> DataFrame:
    """Clip ``column`` to ``[lower_bound, upper_bound]`` in place and return ``df``."""
    limited = df[column].clip(lower=lower_bound, upper=upper_bound)
    df[column] = limited
    return df
61
+
62
+
63
def apply_segment_aware_cap(df: DataFrame, column: str, *, n_segments: int = 2) -> DataFrame:
    """Cap outliers in ``column`` separately within KMeans-derived segments.

    Non-null values are clustered into ``n_segments`` groups with KMeans
    (``random_state=42``).  Inside each group, values are clipped to
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` computed from that group only.

    Returns ``df`` itself when ``column`` is absent or has fewer non-null
    values than ``n_segments``; otherwise returns a modified copy.
    """
    if column not in df.columns:
        return df

    valid = df[column].dropna()
    if len(valid) < n_segments:
        return df

    # Imported lazily so sklearn is only required when clustering really runs.
    from sklearn.cluster import KMeans

    labels = KMeans(n_clusters=n_segments, random_state=42, n_init=10).fit_predict(
        valid.values.reshape(-1, 1)
    )

    # Spread the cluster labels back over all rows *by position* (-1 marks
    # null rows).  Using index labels with a positional indexer would pick
    # the wrong rows, or fail, whenever the index is not 0..n-1.
    row_segment = np.full(len(df), -1, dtype=int)
    row_segment[df[column].notna().to_numpy()] = labels

    df = df.copy()
    for seg in range(n_segments):
        in_segment = row_segment == seg
        if not in_segment.any():
            continue
        seg_vals = df.loc[in_segment, column]
        q1, q3 = seg_vals.quantile(0.25), seg_vals.quantile(0.75)
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        df.loc[in_segment, column] = seg_vals.clip(lower=lower, upper=upper)
    return df
85
+
86
+
87
@_requires_column
def apply_log_transform(df: DataFrame, column: str) -> DataFrame:
    """Replace ``column`` with ``log1p`` of its values, negatives clipped to 0 first."""
    non_negative = df[column].clip(lower=0)
    df[column] = np.log1p(non_negative)
    return df
91
+
92
+
93
@_requires_column
def apply_sqrt_transform(df: DataFrame, column: str) -> DataFrame:
    """Replace ``column`` with the square root of its values, negatives clipped to 0 first."""
    non_negative = df[column].clip(lower=0)
    df[column] = np.sqrt(non_negative)
    return df
97
+
98
+
99
@_requires_column
def apply_zero_inflation_handling(df: DataFrame, column: str) -> DataFrame:
    """Flag zeros in ``column`` and log-transform the remaining values.

    Adds ``"<column>_is_zero"`` (1 where the original value equals 0, else 0)
    and replaces ``column`` with ``log1p`` of its values, negatives clipped to
    0 first.  ``df`` is modified in place and returned.
    """
    values = df[column]
    df[f"{column}_is_zero"] = (values == 0).astype(int)
    # log1p(0) == 0, so transforming the whole column leaves zeros at zero.
    # Assigning the full column (rather than only the non-zero rows through
    # ``.loc``) avoids writing floats into an integer column, which newer
    # pandas versions deprecate/reject as an incompatible-dtype setitem.
    df[column] = np.log1p(values.clip(lower=0))
    return df
105
+
106
+
107
@_requires_column
def apply_cap_then_log(df: DataFrame, column: str) -> DataFrame:
    """Cap ``column`` at its 99th percentile, floor it at 0, then apply ``log1p``.

    ``df`` is modified in place and returned.
    """
    ceiling = df[column].quantile(0.99)
    capped = df[column].clip(upper=ceiling)
    # The floor is applied after the cap, as a separate step.
    floored = capped.clip(lower=0)
    df[column] = np.log1p(floored)
    return df
112
+
113
+
114
@_requires_column
def apply_one_hot_encode(df: DataFrame, column: str) -> DataFrame:
    """Return ``df`` with ``column`` expanded into dummy columns prefixed by its name."""
    encoded = pd.get_dummies(df, columns=[column], prefix=column)
    return encoded
117
+
118
+
119
def apply_feature_select(df: DataFrame, column: str) -> DataFrame:
    """Return ``df`` without ``column``; a missing column is silently ignored."""
    rejected = [column]
    return df.drop(columns=rejected, errors="ignore")
121
+
122
+
123
def apply_derived_ratio(
    df: DataFrame, column: str, *, numerator: str, denominator: str
) -> DataFrame:
    """Store ``numerator / denominator`` in ``column`` and return ``df``.

    Zero denominators are turned into NaN before dividing.  ``df`` is returned
    unchanged when either source column is absent.
    """
    available = df.columns
    if numerator in available and denominator in available:
        safe_denominator = df[denominator].replace(0, float("nan"))
        df[column] = df[numerator] / safe_denominator
    return df
130
+
131
+
132
def apply_derived_interaction(
    df: DataFrame, column: str, *, col_a: str, col_b: str
) -> DataFrame:
    """Store the element-wise product ``col_a * col_b`` in ``column`` and return ``df``.

    ``df`` is returned unchanged when either source column is absent.
    """
    available = df.columns
    if col_a in available and col_b in available:
        product = df[col_a] * df[col_b]
        df[column] = product
    return df
139
+
140
+
141
def apply_derived_composite(
    df: DataFrame, column: str, *, columns: list[str]
) -> DataFrame:
    """Store the row-wise mean of the listed columns in ``column`` and return ``df``.

    Names in ``columns`` that are not present in ``df`` are skipped; when none
    are present ``df`` is returned unchanged.
    """
    present = [name for name in columns if name in df.columns]
    if present:
        df[column] = df[present].mean(axis=1)
    return df