churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,97 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Optional, Union
3
+
4
+ import numpy as np
5
+
6
+ from customer_retention.core.compat import DataFrame, Series, Timestamp, is_datetime64_any_dtype, pd
7
+
8
+
9
@dataclass
class DatetimeTransformResult:
    """Bundle returned by ``DatetimeTransformer`` for one input column.

    Only ``df`` is required; the remaining fields default to empty lists
    (a fresh list per instance) and ``True`` respectively.
    """

    # Frame with one column per generated feature.
    df: DataFrame
    # Names of the calendar features that were extracted.
    extracted_features: list = field(default_factory=list)
    # Cyclical feature names as configured on the transformer.
    cyclical_features: list = field(default_factory=list)
    # Mirrors the transformer's ``drop_original`` setting.
    drop_original: bool = True
15
+
16
+
17
def _iso_week_of_year(s):
    """Return the ISO week number of each timestamp in *s*.

    ``Series.dt.isocalendar().week`` is a nullable integer column, so a plain
    ``astype(int)`` raises as soon as the input holds a missing timestamp.
    When missing values are present the result is returned as float with NaN
    in those positions; otherwise it stays a plain integer series.
    """
    week = s.dt.isocalendar().week
    if week.isna().any():
        return week.astype(float)
    return week.astype(int)


class DatetimeTransformer:
    """Expand a datetime-like series into calendar and cyclical features.

    The transformer is stateless: ``fit`` learns nothing and ``transform`` /
    ``fit_transform`` produce the same output for the same input.

    Args:
        extract_features: Names of features to extract; names that are not keys
            of ``FEATURE_EXTRACTORS`` are ignored. Defaults to
            ``["year", "month", "day_of_week"]``.
        cyclical_features: Extracted features that should additionally get
            ``<name>_sin`` / ``<name>_cos`` columns. Only names that are also
            extracted and have an entry in ``CYCLICAL_PERIODS`` produce columns.
        reference_date: When given, a ``days_since`` column is added holding
            ``reference_date - value`` in whole days.
        drop_original: Passed through unchanged to the result object.
    """

    # Feature name -> callable taking a datetime series and returning values.
    FEATURE_EXTRACTORS = {
        "year": lambda s: s.dt.year,
        "month": lambda s: s.dt.month,
        "day": lambda s: s.dt.day,
        "day_of_week": lambda s: s.dt.dayofweek,
        "day_of_year": lambda s: s.dt.dayofyear,
        "week_of_year": _iso_week_of_year,
        "quarter": lambda s: s.dt.quarter,
        "hour": lambda s: s.dt.hour,
        "minute": lambda s: s.dt.minute,
        "is_weekend": lambda s: s.dt.dayofweek.isin([5, 6]).astype(int),
        "is_month_start": lambda s: s.dt.is_month_start.astype(int),
        "is_month_end": lambda s: s.dt.is_month_end.astype(int),
        "is_quarter_start": lambda s: s.dt.is_quarter_start.astype(int),
        "is_quarter_end": lambda s: s.dt.is_quarter_end.astype(int),
    }

    # Feature name -> period used for the sine/cosine encoding.
    CYCLICAL_PERIODS = {
        "month": 12,
        "day_of_week": 7,
        "day_of_year": 365,
        "quarter": 4,
        "hour": 24,
        "minute": 60,
    }

    def __init__(
        self,
        extract_features: Optional[list[str]] = None,
        cyclical_features: Optional[list[str]] = None,
        reference_date: Optional[Union[str, Timestamp]] = None,
        drop_original: bool = True
    ):
        self.extract_features = extract_features or ["year", "month", "day_of_week"]
        self.cyclical_features = cyclical_features or []
        self.reference_date = Timestamp(reference_date) if reference_date else None
        self.drop_original = drop_original

    def fit(self, series: Series) -> "DatetimeTransformer":
        """Return ``self``; nothing is learned from *series*."""
        return self

    def transform(self, series: Series) -> "DatetimeTransformResult":
        """Build the feature frame for *series*."""
        return self._apply_transform(series)

    def fit_transform(self, series: Series) -> "DatetimeTransformResult":
        """Equivalent to ``transform`` because fitting is a no-op."""
        return self._apply_transform(series)

    def _apply_transform(self, series: Series) -> "DatetimeTransformResult":
        """Extract the configured features and package them in a result object."""
        dt_series = self._ensure_datetime(series)
        columns = {}
        extracted = []

        for feature in self.extract_features:
            extractor = self.FEATURE_EXTRACTORS.get(feature)
            if extractor is None:
                # Unknown feature names are skipped rather than rejected.
                continue

            values = extractor(dt_series)
            columns[feature] = values
            extracted.append(feature)

            period = self.CYCLICAL_PERIODS.get(feature)
            if feature in self.cyclical_features and period:
                columns[f"{feature}_sin"] = np.sin(2 * np.pi * values / period)
                columns[f"{feature}_cos"] = np.cos(2 * np.pi * values / period)

        if self.reference_date is not None:
            columns["days_since"] = (self.reference_date - dt_series).dt.days

        # NOTE: ``cyclical_features`` on the result is the configured list, not
        # the subset for which sin/cos columns were actually produced.
        return DatetimeTransformResult(
            df=DataFrame(columns),
            extracted_features=extracted,
            cyclical_features=self.cyclical_features,
            drop_original=self.drop_original,
        )

    def _ensure_datetime(self, series: Series) -> Series:
        """Return *series* as datetimes, parsing it when it is not one already.

        Parsing uses ``errors='coerce'``, so unparseable entries become NaT
        instead of raising.
        """
        if is_datetime64_any_dtype(series):
            return series
        return pd.to_datetime(series, errors='coerce', format='mixed')
@@ -0,0 +1,181 @@
1
+ from dataclasses import dataclass, field
2
+ from enum import Enum
3
+ from typing import Optional
4
+
5
+ import numpy as np
6
+
7
+ from customer_retention.core.compat import Series, pd
8
+
9
+
10
class ScalingStrategy(str, Enum):
    """Scaling options understood by ``NumericTransformer``.

    Members are also plain strings, so they compare equal to their values.
    """

    STANDARD = "standard"  # (x - mean) / std
    MINMAX = "minmax"      # (x - min) / (max - min)
    ROBUST = "robust"      # (x - median) / IQR
    MAXABS = "maxabs"      # x / max(|x|)
    NONE = "none"          # leave values untouched
16
+
17
+
18
class PowerTransform(str, Enum):
    """Power-transform options understood by ``NumericTransformer``.

    Members are also plain strings, so they compare equal to their values.
    """

    LOG = "log"        # natural log; positive values only
    LOG1P = "log1p"    # log(1 + x); non-negative values only
    SQRT = "sqrt"      # square root; non-negative values only
    # NOTE(review): no dedicated handling for the next two is visible in
    # NumericTransformer in this file - confirm they are implemented elsewhere.
    BOXCOX = "boxcox"
    YEOJOHNSON = "yeojohnson"
    NONE = "none"      # leave values untouched
25
+
26
+
27
@dataclass
class NumericTransformResult:
    """Outcome of running ``NumericTransformer`` on one series.

    Carries the transformed series together with summary statistics taken
    before and after the transformation.
    """

    # Transformed values.
    series: Series

    # Statistics of the non-null input values.
    original_mean: float
    original_std: float
    original_min: float
    original_max: float

    # Statistics of the non-null output values.
    transformed_mean: float
    transformed_std: float

    # Steps that were applied, in order; a fresh list per instance.
    transformations_applied: list = field(default_factory=list)
    # Fitted parameters reported by the transformer; a fresh dict per instance.
    scaler_params: dict = field(default_factory=dict)
38
+
39
+
40
class NumericTransformer:
    """Apply an optional power transform followed by optional scaling.

    ``fit`` learns the scaling statistics from the power-transformed, non-null
    values of a series; ``transform`` then applies power transform and scaling
    to the non-null entries, leaving missing entries missing.

    Args:
        scaling: Scaling strategy applied after the power transform.
        power_transform: Power transform applied first. Only ``LOG``, ``LOG1P``
            and ``SQRT`` change the data here; any other member passes the
            values through unchanged.
    """

    def __init__(
        self,
        scaling: ScalingStrategy = ScalingStrategy.NONE,
        power_transform: PowerTransform = PowerTransform.NONE
    ):
        self.scaling = scaling
        self.power_transform = power_transform
        self._mean: Optional[float] = None
        self._std: Optional[float] = None
        self._min: Optional[float] = None
        self._max: Optional[float] = None
        self._median: Optional[float] = None
        self._iqr: Optional[float] = None
        self._max_abs: Optional[float] = None
        self._is_fitted = False

    def fit(self, series: Series) -> "NumericTransformer":
        """Learn scaling statistics from the non-null values of *series*.

        Statistics are computed after the power transform, using the
        population standard deviation (``ddof=0``).

        Raises:
            ValueError: If the power transform rejects the values.
        """
        clean = series.dropna()
        transformed = self._apply_power_transform(clean)

        self._mean = float(transformed.mean())
        self._std = float(transformed.std(ddof=0))
        self._min = float(transformed.min())
        self._max = float(transformed.max())
        self._median = float(transformed.median())
        q1, q3 = transformed.quantile(0.25), transformed.quantile(0.75)
        self._iqr = float(q3 - q1)
        self._max_abs = float(transformed.abs().max())
        self._is_fitted = True
        return self

    def transform(self, series: Series) -> NumericTransformResult:
        """Transform *series* with previously fitted statistics.

        Raises:
            ValueError: If the transformer has not been fitted.
        """
        if not self._is_fitted:
            raise ValueError("Transformer not fitted. Call fit() or fit_transform() first.")
        return self._apply_transformations(series)

    def fit_transform(self, series: Series) -> NumericTransformResult:
        """Fit on *series* and return its transformation."""
        self.fit(series)
        return self._apply_transformations(series)

    def inverse_transform(self, series: Series) -> Series:
        """Undo scaling and then the power transform.

        The zero-spread special cases of the forward scaling are mirrored so
        that the round trip is exact wherever the forward step was invertible.
        Min-max scaling with a zero range maps everything to 0.0 going forward,
        so its inverse can only return the fitted minimum.

        Raises:
            ValueError: If a scaling strategy is configured but the
                transformer has not been fitted.
        """
        if self.scaling != ScalingStrategy.NONE and not self._is_fitted:
            raise ValueError("Transformer not fitted. Call fit() or fit_transform() first.")

        result = series.copy()

        if self.scaling == ScalingStrategy.STANDARD:
            # Forward step only subtracted the mean when std was zero.
            if self._std == 0:
                result = result + self._mean
            else:
                result = result * self._std + self._mean
        elif self.scaling == ScalingStrategy.MINMAX:
            result = result * (self._max - self._min) + self._min
        elif self.scaling == ScalingStrategy.ROBUST:
            # Forward step only subtracted the median when the IQR was zero.
            if self._iqr == 0:
                result = result + self._median
            else:
                result = result * self._iqr + self._median
        elif self.scaling == ScalingStrategy.MAXABS:
            # Forward step was the identity when max-abs was zero.
            if self._max_abs != 0:
                result = result * self._max_abs

        if self.power_transform == PowerTransform.LOG:
            result = np.exp(result)
        elif self.power_transform == PowerTransform.LOG1P:
            result = np.expm1(result)
        elif self.power_transform == PowerTransform.SQRT:
            result = result ** 2

        return result

    def _apply_power_transform(self, series: Series) -> Series:
        """Apply the configured power transform to *series*.

        Raises:
            ValueError: If the values are outside the transform's domain.
        """
        if self.power_transform == PowerTransform.NONE:
            return series

        if self.power_transform == PowerTransform.LOG:
            if (series <= 0).any():
                raise ValueError("Log transform requires positive values")
            return np.log(series)

        if self.power_transform == PowerTransform.LOG1P:
            if (series < 0).any():
                raise ValueError("Log1p transform requires non-negative values")
            return np.log1p(series)

        if self.power_transform == PowerTransform.SQRT:
            if (series < 0).any():
                raise ValueError("Sqrt transform requires non-negative values")
            return np.sqrt(series)

        # Any other member is passed through unchanged.
        return series

    def _apply_scaling(self, series: Series) -> Series:
        """Scale *series* with the fitted statistics.

        When the relevant spread statistic is zero the division is skipped:
        standard and robust scaling only centre the data, min-max returns
        zeros and max-abs returns the input unchanged.
        """
        if self.scaling == ScalingStrategy.NONE:
            return series

        if self.scaling == ScalingStrategy.STANDARD:
            if self._std == 0:
                return series - self._mean
            return (series - self._mean) / self._std

        if self.scaling == ScalingStrategy.MINMAX:
            range_val = self._max - self._min
            if range_val == 0:
                return pd.Series(0.0, index=series.index)
            return (series - self._min) / range_val

        if self.scaling == ScalingStrategy.ROBUST:
            if self._iqr == 0:
                return series - self._median
            return (series - self._median) / self._iqr

        if self.scaling == ScalingStrategy.MAXABS:
            if self._max_abs == 0:
                return series
            return series / self._max_abs

        return series

    def _apply_transformations(self, series: Series) -> NumericTransformResult:
        """Run power transform and scaling on the non-null entries of *series*."""
        original_clean = series.dropna()
        original_mean = float(original_clean.mean())
        original_std = float(original_clean.std(ddof=0))
        original_min = float(original_clean.min())
        original_max = float(original_clean.max())

        transformations = []

        mask = series.notna()
        result = series.copy()

        will_modify = (
            self.power_transform != PowerTransform.NONE
            or self.scaling != ScalingStrategy.NONE
        )
        if will_modify and not pd.api.types.is_float_dtype(result):
            # Transformed values are floats; writing them into e.g. an integer
            # series relies on a silent upcast that pandas has deprecated.
            result = result.astype(float)

        if self.power_transform != PowerTransform.NONE:
            result.loc[mask] = self._apply_power_transform(series[mask])
            transformations.append(self.power_transform)

        if self.scaling != ScalingStrategy.NONE:
            result.loc[mask] = self._apply_scaling(result[mask])
            transformations.append(self.scaling)

        result_clean = result.dropna()
        transformed_mean = float(result_clean.mean()) if len(result_clean) > 0 else 0.0
        transformed_std = float(result_clean.std(ddof=0)) if len(result_clean) > 0 else 0.0

        return NumericTransformResult(
            series=result,
            original_mean=original_mean, original_std=original_std,
            original_min=original_min, original_max=original_max,
            transformed_mean=transformed_mean, transformed_std=transformed_std,
            transformations_applied=transformations,
            scaler_params={"mean": self._mean, "std": self._std, "min": self._min, "max": self._max}
        )
@@ -0,0 +1,257 @@
1
+ from dataclasses import dataclass, field
2
+ from datetime import datetime
3
+ from typing import Optional
4
+
5
+ import numpy as np
6
+
7
+ from customer_retention.core.compat import DataFrame, pd
8
+ from customer_retention.core.config import ColumnType
9
+ from customer_retention.stages.cleaning import MissingValueHandler, OutlierHandler, OutlierTreatmentStrategy
10
+
11
+ from .binary_handler import BinaryHandler
12
+ from .categorical_encoder import CategoricalEncoder, EncodingStrategy
13
+ from .datetime_transformer import DatetimeTransformer
14
+ from .numeric_transformer import NumericTransformer, ScalingStrategy
15
+
16
+
17
@dataclass
class TransformationManifest:
    """Record of what a ``TransformationPipeline`` run did to a frame.

    Every field has a default, so an empty manifest can be created and then
    filled in stage by stage. Mutable fields use ``default_factory`` so that
    instances never share a container.
    """

    # --- run metadata ---
    timestamp: str = ""
    version: str = "1.0"

    # --- frame shape before and after the run ---
    input_rows: int = 0
    input_columns: int = 0
    output_rows: int = 0
    output_columns: int = 0

    # --- per-stage details, keyed by source column name ---
    columns_dropped: dict = field(default_factory=dict)
    missing_value_handling: dict = field(default_factory=dict)
    outlier_treatment: dict = field(default_factory=dict)
    numeric_transformations: dict = field(default_factory=dict)
    categorical_encodings: dict = field(default_factory=dict)
    datetime_transformations: dict = field(default_factory=dict)
    binary_mappings: dict = field(default_factory=dict)

    # --- resulting layout ---
    # source column -> list of columns derived from it
    column_mapping: dict = field(default_factory=dict)
    # output column -> dtype rendered as a string
    final_schema: dict = field(default_factory=dict)
    # names of the stages, in the order they are executed
    execution_order: list = field(default_factory=list)
35
+
36
+
37
@dataclass
class PipelineResult:
    """Outcome of applying a ``TransformationPipeline`` to a frame."""

    # The transformed frame.
    df: DataFrame
    # What was done to produce ``df``.
    manifest: TransformationManifest
    # Validation outcome; defaults to "passed" with no messages.
    validation_passed: bool = True
    validation_errors: list = field(default_factory=list)
43
+
44
+
45
class TransformationPipeline:
    """Fit per-column preprocessing handlers and apply them in a fixed order.

    Columns are processed according to the ``ColumnType`` given for them in
    ``column_types``; columns without an entry pass through untouched.
    The stages run in the order listed in ``EXECUTION_ORDER``.
    """

    EXECUTION_ORDER = [
        "drop_columns", "handle_missing", "treat_outliers",
        "transform_datetime", "transform_numeric",
        "encode_categorical", "standardize_binary", "validate"
    ]

    # A column whose share of missing values exceeds this is dropped when
    # ``drop_high_missing`` is enabled.
    HIGH_MISSING_THRESHOLD = 0.95

    def __init__(
        self,
        column_types: Optional[dict[str, ColumnType]] = None,
        auto_from_profile: bool = True,
        column_configs: Optional[dict] = None,
        drop_constant_columns: bool = False,
        drop_high_missing: bool = True,
        create_missing_indicators: bool = False,
        validate_output: bool = True
    ):
        """Configure the pipeline.

        Args:
            column_types: Mapping of column name to its ``ColumnType``.
            auto_from_profile: Stored on the instance; no method of this
                class reads it.
            column_configs: Optional per-column overrides. The only key read
                here is ``"missing_strategy"``.
            drop_constant_columns: Drop typed columns with at most one
                distinct value.
            drop_high_missing: Drop typed columns whose missing ratio exceeds
                ``HIGH_MISSING_THRESHOLD``.
            create_missing_indicators: Stored on the instance; no method of
                this class reads it.
            validate_output: When false, the null/infinite checks are skipped
                and the result is reported as passed with no errors.
        """
        self.column_types = column_types or {}
        self.auto_from_profile = auto_from_profile
        self.column_configs = column_configs or {}
        self.drop_constant_columns = drop_constant_columns
        self.drop_high_missing = drop_high_missing
        self.create_missing_indicators = create_missing_indicators
        self.validate_output = validate_output

        self._reset_fitted_state()

    def _reset_fitted_state(self) -> None:
        """Discard everything learned by a previous ``fit`` call."""
        self._missing_handlers: dict[str, MissingValueHandler] = {}
        self._outlier_handlers: dict[str, OutlierHandler] = {}
        self._numeric_transformers: dict[str, NumericTransformer] = {}
        self._categorical_encoders: dict[str, CategoricalEncoder] = {}
        self._datetime_transformers: dict[str, DatetimeTransformer] = {}
        self._binary_handlers: dict[str, BinaryHandler] = {}
        self._columns_to_drop: list[str] = []
        self._is_fitted = False

    def fit(self, df: DataFrame) -> "TransformationPipeline":
        """Learn per-column handlers from ``df`` and return ``self``.

        Any state from an earlier ``fit`` is cleared first, so handlers for
        columns that no longer need them cannot leak into ``transform``.
        """
        self._reset_fitted_state()
        self._identify_columns_to_drop(df)
        working_df = df.drop(columns=self._columns_to_drop, errors='ignore')

        for col, col_type in self.column_types.items():
            # Dropped columns are no longer present in ``working_df``.
            if col not in working_df.columns:
                continue
            self._fit_column(working_df, col, col_type)

        self._is_fitted = True
        return self

    def transform(self, df: DataFrame) -> PipelineResult:
        """Apply the fitted handlers to ``df``.

        Raises:
            ValueError: If the pipeline has not been fitted.
        """
        if not self._is_fitted:
            raise ValueError("Pipeline not fitted. Call fit() or fit_transform() first.")
        return self._apply_transformations(df)

    def fit_transform(self, df: DataFrame) -> PipelineResult:
        """Fit on ``df`` and immediately transform the same frame."""
        self.fit(df)
        return self._apply_transformations(df)

    def _identify_columns_to_drop(self, df: DataFrame):
        """Populate ``_columns_to_drop`` with each qualifying column once.

        Only columns listed in ``column_types`` and present in ``df`` are
        considered. A column qualifies if it is an identifier, is mostly
        missing (when enabled) or is constant (when enabled).
        """
        self._columns_to_drop = []
        for col, col_type in self.column_types.items():
            if col not in df.columns:
                continue
            series = df[col]
            should_drop = (
                col_type == ColumnType.IDENTIFIER
                or (self.drop_high_missing
                    and series.isna().mean() > self.HIGH_MISSING_THRESHOLD)
                or (self.drop_constant_columns and series.nunique() <= 1)
            )
            if should_drop:
                self._columns_to_drop.append(col)

    def _fit_column(self, df: DataFrame, col: str, col_type: ColumnType):
        """Fit and register the handlers appropriate for one column.

        Target columns are left alone. Handlers are stored only after their
        ``fit`` call has returned, so a failure does not leave an unfitted
        handler registered.
        """
        if col_type == ColumnType.TARGET:
            return

        series = df[col]
        config = self.column_configs.get(col, {})

        if series.isna().any():
            handler = MissingValueHandler.from_column_type(col_type)
            if "missing_strategy" in config:
                from customer_retention.stages.cleaning import ImputationStrategy
                handler.strategy = ImputationStrategy(config["missing_strategy"])
            handler.fit(series)
            self._missing_handlers[col] = handler

        if col_type in (ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE):
            observed = series.dropna()

            outlier_handler = OutlierHandler(
                treatment_strategy=OutlierTreatmentStrategy.CAP_IQR
            )
            outlier_handler.fit(observed)
            self._outlier_handlers[col] = outlier_handler

            # Fit the scaler on CAPPED data so extreme values do not distort
            # the learned scaling parameters.
            capped = outlier_handler.transform(observed).series
            numeric_transformer = NumericTransformer(scaling=ScalingStrategy.STANDARD)
            numeric_transformer.fit(capped)
            self._numeric_transformers[col] = numeric_transformer

        elif col_type in (ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL):
            encoder = CategoricalEncoder(
                strategy=EncodingStrategy.ONE_HOT, drop_first=True
            )
            encoder.fit(series)
            self._categorical_encoders[col] = encoder

        elif col_type == ColumnType.DATETIME:
            datetime_transformer = DatetimeTransformer()
            datetime_transformer.fit(series)
            self._datetime_transformers[col] = datetime_transformer

        elif col_type == ColumnType.BINARY:
            binary_handler = BinaryHandler()
            binary_handler.fit(series)
            self._binary_handlers[col] = binary_handler

    def _apply_transformations(self, df: DataFrame) -> PipelineResult:
        """Run every fitted stage on a copy of ``df`` and build the manifest.

        The input frame is not modified. Handlers whose column is absent
        from ``df`` are skipped.
        """
        manifest = TransformationManifest(
            timestamp=datetime.now().isoformat(),
            input_rows=len(df), input_columns=len(df.columns),
            # Copy so that editing a manifest cannot alter the class constant.
            execution_order=list(self.EXECUTION_ORDER)
        )

        working_df = df.copy()

        # Stage: drop columns.
        manifest.columns_dropped = {
            col: "identifier/high_missing/constant" for col in self._columns_to_drop
        }
        working_df = working_df.drop(columns=self._columns_to_drop, errors='ignore')

        # Stage: missing values.
        for col, handler in self._missing_handlers.items():
            if col in working_df.columns:
                result = handler.transform(working_df[col])
                working_df[col] = result.series
                manifest.missing_value_handling[col] = {
                    "strategy": str(result.strategy_used),
                    "values_imputed": result.values_imputed
                }

        # Stage: outliers.
        for col, handler in self._outlier_handlers.items():
            if col in working_df.columns:
                result = handler.transform(working_df[col])
                working_df[col] = result.series
                manifest.outlier_treatment[col] = {
                    "method": str(result.method_used),
                    "outliers_detected": result.outliers_detected
                }

        # Stage: datetime. Each source column is replaced by its extracted
        # feature columns.
        datetime_cols_to_drop = []
        datetime_extracted_cols = []
        for col, transformer in self._datetime_transformers.items():
            if col in working_df.columns:
                result = transformer.transform(working_df[col])
                for new_col in result.df.columns:
                    # ``.values`` assigns positionally, ignoring the index.
                    working_df[new_col] = result.df[new_col].values
                    datetime_extracted_cols.append(new_col)
                datetime_cols_to_drop.append(col)
                manifest.datetime_transformations[col] = {
                    "extracted": result.extracted_features
                }
                manifest.column_mapping[col] = list(result.df.columns)
        working_df = working_df.drop(columns=datetime_cols_to_drop, errors='ignore')

        # Handle NaN values from invalid datetime parsing (e.g., '1/0/00') by
        # filling extracted features with their median, when one exists.
        for col in datetime_extracted_cols:
            if col in working_df.columns and working_df[col].isna().any():
                median_val = working_df[col].median()
                if pd.notna(median_val):
                    working_df[col] = working_df[col].fillna(median_val)

        # Stage: numeric scaling.
        for col, transformer in self._numeric_transformers.items():
            if col in working_df.columns:
                result = transformer.transform(working_df[col])
                working_df[col] = result.series
                manifest.numeric_transformations[col] = {
                    "transformations": [str(t) for t in result.transformations_applied]
                }

        # Stage: categorical encoding. The source column is replaced only
        # when the encoder produced a frame of new columns.
        categorical_cols_to_drop = []
        for col, encoder in self._categorical_encoders.items():
            if col in working_df.columns:
                result = encoder.transform(working_df[col])
                if result.df is not None:
                    for new_col in result.df.columns:
                        working_df[new_col] = result.df[new_col].values
                    categorical_cols_to_drop.append(col)
                    manifest.column_mapping[col] = list(result.df.columns)
                manifest.categorical_encodings[col] = {
                    "strategy": str(result.strategy),
                    "columns_created": result.columns_created
                }
        working_df = working_df.drop(columns=categorical_cols_to_drop, errors='ignore')

        # Stage: binary standardisation.
        for col, handler in self._binary_handlers.items():
            if col in working_df.columns:
                result = handler.transform(working_df[col])
                working_df[col] = result.series
                manifest.binary_mappings[col] = {"mapping": result.mapping}

        # Stage: validation, honouring the constructor flag.
        if self.validate_output:
            validation_passed, validation_errors = self._validate_output(working_df)
        else:
            validation_passed, validation_errors = True, []

        manifest.output_rows = len(working_df)
        manifest.output_columns = len(working_df.columns)
        manifest.final_schema = {col: str(working_df[col].dtype) for col in working_df.columns}

        return PipelineResult(
            df=working_df, manifest=manifest,
            validation_passed=validation_passed, validation_errors=validation_errors
        )

    def _validate_output(self, df: DataFrame) -> tuple[bool, list[str]]:
        """Check the non-target columns of ``df`` for nulls and infinities.

        Returns:
            ``(passed, errors)`` where ``errors`` holds a ``TQ001`` message if
            any column contains nulls and a ``TQ002`` message if any numeric
            column contains an infinite value.
        """
        errors = []

        target_cols = [
            c for c, t in self.column_types.items()
            if t == ColumnType.TARGET and c in df.columns
        ]
        non_target = df.drop(columns=target_cols, errors='ignore')

        if non_target.isna().any().any():
            null_cols = non_target.columns[non_target.isna().any()].tolist()
            errors.append(f"TQ001: Null values in columns: {null_cols}")

        numeric_df = non_target.select_dtypes(include=[np.number])
        if np.isinf(numeric_df.values).any():
            errors.append("TQ002: Infinite values found")

        return len(errors) == 0, errors
@@ -0,0 +1,60 @@
1
+ from customer_retention.core.components.enums import Severity
2
+
3
+ from .adversarial_scoring_validator import (
4
+ AdversarialScoringValidator,
5
+ AdversarialValidationResult,
6
+ DriftSeverity,
7
+ FeatureDrift,
8
+ )
9
+ from .business_sense_gate import BusinessCheck, BusinessSenseGate, BusinessSenseResult
10
+ from .data_quality_gate import DataQualityGate
11
+ from .data_validators import DataValidator, DateLogicResult, DuplicateResult, RangeValidationResult
12
+ from .feature_quality_gate import FeatureQualityGate
13
+ from .gates import GateResult, ValidationGate, ValidationIssue
14
+ from .leakage_gate import LeakageCheckResult, LeakageGate
15
+ from .model_validity_gate import ModelValidityGate, ModelValidityResult
16
+ from .pipeline_validation_runner import (
17
+ PipelineValidationConfig,
18
+ PipelineValidationRunner,
19
+ compare_pipeline_outputs,
20
+ run_pipeline_validation,
21
+ validate_feature_transformation,
22
+ )
23
+ from .quality_scorer import QualityLevel, QualityScorer, QualityScoreResult
24
+ from .rule_generator import RuleGenerator
25
+ from .scoring_pipeline_validator import (
26
+ FeatureMismatch,
27
+ MismatchSeverity,
28
+ PredictionMismatch,
29
+ ScoringPipelineValidator,
30
+ ValidationConfig,
31
+ ValidationReport,
32
+ )
33
+ from .timeseries_detector import (
34
+ DatasetType,
35
+ TimeSeriesCharacteristics,
36
+ TimeSeriesDetector,
37
+ TimeSeriesFrequency,
38
+ TimeSeriesValidationResult,
39
+ TimeSeriesValidator,
40
+ )
41
+
42
# Public API of the package, grouped by the module each name is imported from.
__all__ = [
    # core enums and gate primitives
    "Severity",
    "ValidationIssue",
    "GateResult",
    "ValidationGate",
    # quality gates
    "DataQualityGate",
    "FeatureQualityGate",
    # leakage gate
    "LeakageGate",
    "LeakageCheckResult",
    # model validity gate
    "ModelValidityGate",
    "ModelValidityResult",
    # business sense gate
    "BusinessSenseGate",
    "BusinessSenseResult",
    "BusinessCheck",
    # data validators
    "DataValidator",
    "DuplicateResult",
    "DateLogicResult",
    "RangeValidationResult",
    # quality scorer
    "QualityScorer",
    "QualityScoreResult",
    "QualityLevel",
    # rule generator
    "RuleGenerator",
    # scoring pipeline validator
    "ScoringPipelineValidator",
    "ValidationReport",
    "ValidationConfig",
    "FeatureMismatch",
    "PredictionMismatch",
    "MismatchSeverity",
    # pipeline validation runner
    "PipelineValidationRunner",
    "PipelineValidationConfig",
    "run_pipeline_validation",
    "validate_feature_transformation",
    "compare_pipeline_outputs",
    # time series detector
    "TimeSeriesDetector",
    "TimeSeriesValidator",
    "TimeSeriesCharacteristics",
    "TimeSeriesValidationResult",
    "DatasetType",
    "TimeSeriesFrequency",
    # adversarial scoring validator
    "AdversarialScoringValidator",
    "AdversarialValidationResult",
    "FeatureDrift",
    "DriftSeverity",
]