churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,242 @@
1
+ from dataclasses import dataclass, field
2
+ from datetime import datetime
3
+ from enum import Enum
4
+ from typing import Dict, List, Optional
5
+
6
+ import numpy as np
7
+ from sklearn.calibration import calibration_curve
8
+ from sklearn.metrics import auc, brier_score_loss, precision_recall_curve, precision_score, recall_score, roc_auc_score
9
+
10
+ from customer_retention.core.compat import Series
11
+
12
+
13
class PerformanceStatus(Enum):
    """Overall health verdict attached to a monitoring result."""

    # Member values are the upper-case member names.
    OK = "OK"
    WARNING = "WARNING"
    CRITICAL = "CRITICAL"
17
+
18
+
19
@dataclass
class MonitoringConfig:
    """Thresholds used to grade metric degradation against a baseline.

    The AUC thresholds are compared against ``baseline - current`` and the
    Brier thresholds against ``current - baseline`` (absolute differences,
    not percentages).
    """

    # PR-AUC: absolute drop that triggers WARNING / CRITICAL.
    pr_auc_warning_drop: float = 0.1
    pr_auc_critical_drop: float = 0.15

    # ROC-AUC: absolute drop that triggers WARNING / CRITICAL.
    roc_auc_warning_drop: float = 0.08
    roc_auc_critical_drop: float = 0.1

    # NOTE(review): the precision/recall thresholds are not read by any code
    # visible in this module -- confirm whether another consumer uses them.
    precision_warning_drop: float = 0.2
    recall_warning_drop: float = 0.2

    # Brier score: absolute increase that triggers WARNING / CRITICAL.
    brier_warning_increase: float = 0.05
    brier_critical_increase: float = 0.1
29
+
30
+
31
@dataclass
class PerformanceResult:
    """Outcome of one monitoring evaluation."""

    # Metrics computed for the evaluated batch, keyed by metric name.
    current_metrics: Dict[str, float]
    # Reference metrics the batch was compared against.
    baseline_metrics: Dict[str, float]
    # Per-metric "<name>_change_pct" and "<name>_diff" entries.
    comparison: Dict[str, float]
    # Overall verdict for this evaluation.
    status: PerformanceStatus
    # Number of labelled rows that went into the metrics (0 when unlabelled).
    labels_available: int
    # Summary statistics of the predictions, used when no labels exist.
    proxy_metrics: Optional[Dict] = None
    # Timestamp taken when the result object is created.
    monitoring_date: datetime = field(default_factory=datetime.now)
40
+
41
+
42
@dataclass
class CalibrationCurve:
    """Points of a reliability diagram plus per-bin sample counts."""

    # Mean predicted probability per bin.
    bin_means: List[float]
    # Observed positive rate per bin.
    actual_rates: List[float]
    # Number of predictions per bin.
    counts: List[int]
47
+
48
+
49
@dataclass
class DistributionAnalysis:
    """Summary statistics describing a distribution of values."""

    mean: float
    std: float
    min_val: float
    max_val: float
    # Quantile values keyed by the percentile written as a string, e.g. "25".
    percentiles: Dict[str, float]
56
+
57
+
58
@dataclass
class ProportionAnalysis:
    """Relative frequency of each segment value."""

    # Maps a segment value to its share of the observations.
    proportions: Dict[str, float]
61
+
62
+
63
@dataclass
class DistributionComparison:
    """Result of comparing a current sample against a reference sample."""

    # True when the comparison judged the two samples to differ.
    distribution_shift_detected: bool
    # Two-sample Kolmogorov-Smirnov statistic.
    ks_statistic: float
    # Absolute difference between the two sample means.
    mean_diff: float
68
+
69
+
70
@dataclass
class TrendReport:
    """PR-AUC history together with a coarse direction label."""

    # PR-AUC values, one per recorded evaluation.
    pr_auc_trend: List[float]
    # Timestamps matching ``pr_auc_trend`` entry for entry.
    dates: List[datetime]
    # Free-form direction label.
    trend_direction: str
75
+
76
+
77
class PerformanceMonitor:
    """Compare model metrics against a fixed baseline and grade the result.

    Results produced by :meth:`evaluate` are appended to an in-memory history
    that backs :meth:`get_history` and :meth:`get_trend_report`.
    """

    def __init__(self, baseline_metrics: Dict[str, float],
                 config: Optional[MonitoringConfig] = None):
        """Store the baseline and thresholds.

        Args:
            baseline_metrics: Reference metric values keyed by metric name.
            config: Degradation thresholds; defaults to ``MonitoringConfig()``.
        """
        self.baseline_metrics = baseline_metrics
        self.config = config or MonitoringConfig()
        self._history: List[PerformanceResult] = []

    def evaluate(self, y_true: Series, y_prob: Series,
                 y_pred: Optional[Series] = None) -> PerformanceResult:
        """Compute metrics on the labelled rows and record the result.

        Rows whose label is missing are excluded from every metric.

        Args:
            y_true: Ground-truth labels; may contain missing values.
            y_prob: Predicted probabilities, indexed like ``y_true``.
            y_pred: Optional hard predictions. When omitted, predictions are
                derived from ``y_prob`` with a 0.5 cut-off.

        Returns:
            The graded result, which is also appended to the history.
        """
        has_label = y_true.notna()
        y_true_clean = y_true.dropna()
        y_prob_clean = y_prob[has_label]

        precision, recall, _ = precision_recall_curve(y_true_clean, y_prob_clean)
        current_metrics = {
            "pr_auc": auc(recall, precision),
            "roc_auc": roc_auc_score(y_true_clean, y_prob_clean),
            "brier_score": brier_score_loss(y_true_clean, y_prob_clean),
        }

        if y_pred is not None:
            hard_predictions = y_pred[has_label]
        else:
            hard_predictions = (y_prob_clean >= 0.5).astype(int)
        current_metrics["precision"] = precision_score(y_true_clean, hard_predictions)
        current_metrics["recall"] = recall_score(y_true_clean, hard_predictions)

        result = PerformanceResult(
            current_metrics=current_metrics,
            baseline_metrics=self.baseline_metrics,
            comparison=self._compare_to_baseline(current_metrics),
            status=self._determine_status(current_metrics),
            labels_available=len(y_true_clean),
        )
        self._history.append(result)
        return result

    def evaluate_without_labels(self, y_prob: Series) -> PerformanceResult:
        """Summarise the prediction distribution when no labels exist.

        The returned result always has status ``OK``, empty metric and
        comparison dicts, and is not added to the history.
        """
        dist_analysis = ProxyMetrics().analyze_prediction_distribution(y_prob)
        proxy_metrics = {
            "mean_prediction": dist_analysis.mean,
            "std_prediction": dist_analysis.std,
            "percentile_25": dist_analysis.percentiles["25"],
            "percentile_50": dist_analysis.percentiles["50"],
            "percentile_75": dist_analysis.percentiles["75"],
        }
        return PerformanceResult(
            current_metrics={},
            baseline_metrics=self.baseline_metrics,
            comparison={},
            status=PerformanceStatus.OK,
            labels_available=0,
            proxy_metrics=proxy_metrics,
        )

    def compare_metrics(self, metrics: Dict[str, float]) -> PerformanceResult:
        """Grade pre-computed metrics against the baseline.

        The result is not added to the history and reports
        ``labels_available=0``.
        """
        return PerformanceResult(
            current_metrics=metrics,
            baseline_metrics=self.baseline_metrics,
            comparison=self._compare_to_baseline(metrics),
            status=self._determine_status(metrics),
            labels_available=0,
        )

    def _compare_to_baseline(self, current: Dict[str, float]) -> Dict[str, float]:
        """Return ``<metric>_change_pct`` and ``<metric>_diff`` per shared metric.

        Metrics absent from the baseline are skipped. A zero baseline yields a
        percentage change of 0 to avoid dividing by zero.
        """
        comparison = {}
        for metric, value in current.items():
            if metric not in self.baseline_metrics:
                continue
            baseline = self.baseline_metrics[metric]
            diff = value - baseline
            comparison[f"{metric}_change_pct"] = diff / baseline * 100 if baseline != 0 else 0
            comparison[f"{metric}_diff"] = diff
        return comparison

    def _determine_status(self, current: Dict[str, float]) -> PerformanceStatus:
        """Return the most severe status across PR-AUC, ROC-AUC and Brier score.

        Every metric present in both ``current`` and the baseline is checked.
        A WARNING on one metric no longer hides a CRITICAL on a later one: the
        method only returns early on CRITICAL, which cannot be exceeded.
        """
        cfg = self.config
        # (metric, higher_is_better, warning threshold, critical threshold)
        checks = (
            ("pr_auc", True, cfg.pr_auc_warning_drop, cfg.pr_auc_critical_drop),
            ("roc_auc", True, cfg.roc_auc_warning_drop, cfg.roc_auc_critical_drop),
            ("brier_score", False, cfg.brier_warning_increase, cfg.brier_critical_increase),
        )
        status = PerformanceStatus.OK
        for metric, higher_is_better, warning_at, critical_at in checks:
            if metric not in current or metric not in self.baseline_metrics:
                continue
            baseline = self.baseline_metrics[metric]
            value = current[metric]
            # Degradation is a drop for AUC metrics and an increase for Brier.
            degradation = baseline - value if higher_is_better else value - baseline
            if degradation >= critical_at:
                return PerformanceStatus.CRITICAL
            if degradation >= warning_at:
                status = PerformanceStatus.WARNING
        return status

    def get_history(self) -> List[PerformanceResult]:
        """Return a shallow copy of the recorded results."""
        return self._history.copy()

    def get_trend_report(self) -> Dict:
        """Describe how PR-AUC moved between the first and last evaluation.

        Returns:
            A dict with ``pr_auc_trend``, ``dates`` and ``trend_direction``.
            With fewer than two recorded results the lists are empty and the
            direction is ``"insufficient_data"``. A result without a PR-AUC
            entry contributes 0 to the trend.
        """
        if len(self._history) < 2:
            return {"pr_auc_trend": [], "dates": [], "trend_direction": "insufficient_data"}
        pr_auc_values = [h.current_metrics.get("pr_auc", 0) for h in self._history]
        dates = [h.monitoring_date for h in self._history]
        first, last = pr_auc_values[0], pr_auc_values[-1]
        if last > first:
            direction = "improving"
        elif last < first:
            direction = "declining"
        else:
            direction = "stable"
        return {
            "pr_auc_trend": pr_auc_values,
            "dates": dates,
            "trend_direction": direction,
        }

    def get_calibration_curve(self, y_true: Series, y_prob: Series,
                              n_bins: int = 10) -> CalibrationCurve:
        """Build a reliability curve with per-bin prediction counts.

        Args:
            y_true: Ground-truth labels.
            y_prob: Predicted probabilities.
            n_bins: Number of equal-width bins over [0, 1].

        Returns:
            A ``CalibrationCurve``. ``counts`` always has ``n_bins`` entries,
            including zeros for empty bins.

        NOTE(review): sklearn documents that ``calibration_curve`` can return
        fewer than ``n_bins`` points, so ``bin_means``/``actual_rates`` may be
        shorter than ``counts`` -- confirm how callers line them up.
        """
        prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins)
        edges = np.linspace(0, 1, n_bins + 1)
        bin_counts = []
        for i in range(n_bins):
            in_bin = y_prob >= edges[i]
            if i == n_bins - 1:
                # Close the last bin on the right so predictions of exactly
                # 1.0 are counted instead of silently dropped.
                in_bin = in_bin & (y_prob <= edges[i + 1])
            else:
                in_bin = in_bin & (y_prob < edges[i + 1])
            # Plain int keeps the declared List[int] type.
            bin_counts.append(int(in_bin.sum()))
        return CalibrationCurve(
            bin_means=prob_pred.tolist(),
            actual_rates=prob_true.tolist(),
            counts=bin_counts,
        )
210
+
211
+
212
class ProxyMetrics:
    """Label-free statistics for checking model output when no labels exist."""

    def analyze_prediction_distribution(self, y_prob: Series) -> DistributionAnalysis:
        """Summarise ``y_prob`` by mean, std, min, max and five percentiles.

        The percentiles are the 10th, 25th, 50th, 75th and 90th, keyed by
        their number as a string.
        """
        average = y_prob.mean()
        spread = y_prob.std()
        lowest = y_prob.min()
        highest = y_prob.max()
        quantile_levels = (("10", 0.10), ("25", 0.25), ("50", 0.50), ("75", 0.75), ("90", 0.90))
        percentiles = {}
        for label, level in quantile_levels:
            percentiles[label] = y_prob.quantile(level)
        return DistributionAnalysis(
            mean=average,
            std=spread,
            min_val=lowest,
            max_val=highest,
            percentiles=percentiles,
        )

    def analyze_segment_proportions(self, segments: Series) -> ProportionAnalysis:
        """Return each distinct segment value's share of ``segments``."""
        shares = segments.value_counts(normalize=True)
        return ProportionAnalysis(proportions=shares.to_dict())

    def compare_distributions(self, reference: Series,
                              current: Series) -> DistributionComparison:
        """Compare ``current`` against ``reference`` for distribution shift.

        A shift is flagged when the two-sample KS statistic exceeds 0.1 or
        the absolute mean difference exceeds half the reference standard
        deviation.
        """
        from scipy import stats

        ks_stat = stats.ks_2samp(reference, current)[0]
        mean_diff = abs(current.mean() - reference.mean())
        if ks_stat > 0.1:
            shift_detected = True
        else:
            shift_detected = mean_diff > reference.std() * 0.5
        return DistributionComparison(
            distribution_shift_detected=shift_detected,
            ks_statistic=ks_stat,
            mean_diff=mean_diff,
        )
@@ -0,0 +1,5 @@
1
+ from .transformer_manager import TransformerBundle, TransformerManager, TransformerManifest
2
+
3
# Public names re-exported from the transformer_manager module.
__all__ = [
    "TransformerManager",
    "TransformerBundle",
    "TransformerManifest",
]
@@ -0,0 +1,284 @@
1
+ """Transformer persistence and consistent application for training/scoring.
2
+
3
+ Ensures that the same transformations (scaling, encoding) applied during training
4
+ are replicated exactly during scoring to prevent data leakage and prediction errors.
5
+ """
6
+ import json
7
+ import tempfile
8
+ from dataclasses import dataclass, field
9
+ from pathlib import Path
10
+ from typing import Any, Dict, List, Optional, Union
11
+
12
+ import joblib
13
+ import numpy as np
14
+ import pandas as pd
15
+ from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, StandardScaler
16
+
17
+
18
@dataclass
class TransformerManifest:
    """Plain-data description of a fitted transformer set.

    The manifest contains only built-in types (lists, strings, ``None``).
    """

    # Columns that were passed through the numeric scaler.
    numeric_columns: List[str] = field(default_factory=list)
    # Columns that were label-encoded.
    categorical_columns: List[str] = field(default_factory=list)
    # Name of the scaler that was requested, or None when unset.
    scaler_type: Optional[str] = None
    # Name of the categorical encoding scheme.
    encoder_type: str = "label"
    # Column order of the transformed training frame.
    feature_order: List[str] = field(default_factory=list)
    # Timestamp string recorded when the manifest was built.
    created_at: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the manifest as a dictionary.

        The list fields are copied so that mutating the returned dictionary
        cannot alter this manifest.
        """
        return {
            "numeric_columns": list(self.numeric_columns),
            "categorical_columns": list(self.categorical_columns),
            "scaler_type": self.scaler_type,
            "encoder_type": self.encoder_type,
            "feature_order": list(self.feature_order),
            "created_at": self.created_at,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "TransformerManifest":
        """Build a manifest from a dictionary produced by ``to_dict``.

        Missing keys fall back to the dataclass defaults. List fields that are
        absent or ``None`` become empty lists, and present lists are copied so
        the manifest does not share state with ``data``.
        """
        return cls(
            numeric_columns=list(data.get("numeric_columns") or []),
            categorical_columns=list(data.get("categorical_columns") or []),
            scaler_type=data.get("scaler_type"),
            encoder_type=data.get("encoder_type", "label"),
            feature_order=list(data.get("feature_order") or []),
            created_at=data.get("created_at"),
        )
38
+
39
+
40
@dataclass
class TransformerBundle:
    """Container pairing the fitted transformer objects with their manifest."""

    # Fitted numeric scaler, or None when no scaler is held.
    scaler: Optional[Any] = None
    # Fitted label encoder per categorical column name.
    encoders: Dict[str, LabelEncoder] = field(default_factory=dict)
    # Metadata describing the columns and settings of the transformers.
    manifest: TransformerManifest = field(default_factory=TransformerManifest)

    def to_dict(self) -> Dict[str, Any]:
        """Return the bundle as a dictionary (manifest converted via its ``to_dict``)."""
        payload: Dict[str, Any] = {}
        payload["numeric_scaler"] = self.scaler
        payload["label_encoders"] = self.encoders
        payload["manifest"] = self.manifest.to_dict()
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "TransformerBundle":
        """Rebuild a bundle from a dictionary shaped like the output of ``to_dict``.

        Absent keys yield no scaler, no encoders and a default manifest.
        """
        restored_manifest = TransformerManifest.from_dict(data.get("manifest", {}))
        return cls(
            scaler=data.get("numeric_scaler"),
            encoders=data.get("label_encoders", {}),
            manifest=restored_manifest,
        )
54
+
55
+
56
class TransformerManager:
    """Manages transformer persistence and application for training/scoring consistency.

    Usage for Training:
        manager = TransformerManager()
        df_transformed = manager.fit_transform(df, numeric_cols, categorical_cols)
        manager.save("./output/transformers/transformers.joblib")
        manager.log_to_mlflow(run_id)

    Usage for Scoring:
        manager = TransformerManager.load_from_mlflow(run_id)
        # OR: manager = TransformerManager.load("./output/transformers/transformers.joblib")
        df_transformed = manager.transform(df)
    """

    def __init__(self, scaler_type: str = "standard"):
        """Initialize transformer manager.

        Args:
            scaler_type: Type of scaler to use ("standard", "robust", "minmax").
                Any other value falls back to the standard scaler.
        """
        self._scaler_type = scaler_type
        self._bundle = TransformerBundle()
        self._is_fitted = False

    @property
    def is_fitted(self) -> bool:
        """Whether transformers have been fitted or loaded."""
        return self._is_fitted

    @property
    def manifest(self) -> TransformerManifest:
        """Manifest describing the currently held transformers."""
        return self._bundle.manifest

    def fit_transform(self, df: pd.DataFrame,
                      numeric_columns: Optional[List[str]] = None,
                      categorical_columns: Optional[List[str]] = None,
                      exclude_columns: Optional[List[str]] = None) -> pd.DataFrame:
        """Fit transformers on training data and transform it.

        Missing numeric values are replaced with 0 before scaling and
        categorical values are converted to strings before encoding.

        Args:
            df: Training DataFrame (not modified; a copy is transformed)
            numeric_columns: Columns to scale (auto-detected if None)
            categorical_columns: Columns to encode (auto-detected if None)
            exclude_columns: Columns to exclude from transformation

        Returns:
            Transformed DataFrame
        """
        from datetime import datetime

        df = df.copy()
        exclude = set(exclude_columns or [])

        numeric_columns = self._resolve_numeric_columns(df, numeric_columns, exclude)
        categorical_columns = self._resolve_categorical_columns(df, categorical_columns, exclude)

        self._fit_numeric_scaler(df, numeric_columns)
        self._fit_categorical_encoders(df, categorical_columns)
        self._build_manifest(df, numeric_columns, categorical_columns, exclude, datetime.now().isoformat())
        self._is_fitted = True

        return df

    def _resolve_numeric_columns(self, df: pd.DataFrame, columns: Optional[List[str]], exclude: set) -> List[str]:
        """Return the numeric columns to scale, dropping unknown or excluded names.

        When ``columns`` is None, columns with dtype int64/float64/int32/float32
        are selected automatically.
        """
        if columns is None:
            columns = list(df.select_dtypes(include=["int64", "float64", "int32", "float32"]).columns)
        return [c for c in columns if c in df.columns and c not in exclude]

    def _resolve_categorical_columns(self, df: pd.DataFrame, columns: Optional[List[str]], exclude: set) -> List[str]:
        """Return the categorical columns to encode, dropping unknown or excluded names.

        When ``columns`` is None, columns with dtype object/category are
        selected automatically.
        """
        if columns is None:
            columns = list(df.select_dtypes(include=["object", "category"]).columns)
        return [c for c in columns if c in df.columns and c not in exclude]

    def _fit_numeric_scaler(self, df: pd.DataFrame, numeric_columns: List[str]) -> None:
        """Fit the scaler on ``numeric_columns`` and scale them in ``df`` in place."""
        if not numeric_columns:
            # Clear any scaler left over from an earlier fit so a refit without
            # numeric columns does not persist a stale object.
            self._bundle.scaler = None
            return
        scaler = self._create_scaler()
        df[numeric_columns] = scaler.fit_transform(df[numeric_columns].fillna(0))
        self._bundle.scaler = scaler

    def _fit_categorical_encoders(self, df: pd.DataFrame, categorical_columns: List[str]) -> None:
        """Fit one LabelEncoder per categorical column and encode ``df`` in place."""
        encoders = {}
        for col in categorical_columns:
            le = LabelEncoder()
            # Values are stringified first, so missing values become a label too.
            df[col] = le.fit_transform(df[col].astype(str))
            encoders[col] = le
        self._bundle.encoders = encoders

    def _build_manifest(self, df: pd.DataFrame, numeric_columns: List[str],
                        categorical_columns: List[str], exclude: set, created_at: str) -> None:
        """Record the fitted columns, settings and feature order in the bundle manifest."""
        feature_order = [c for c in df.columns if c not in exclude]
        self._bundle.manifest = TransformerManifest(
            numeric_columns=numeric_columns, categorical_columns=categorical_columns,
            scaler_type=self._scaler_type, encoder_type="label",
            feature_order=feature_order, created_at=created_at)

    def transform(self, df: pd.DataFrame,
                  exclude_columns: Optional[List[str]] = None) -> pd.DataFrame:
        """Apply fitted transformers to new data (for scoring).

        Numeric or categorical training columns that are absent from ``df`` are
        filled with a default value. Any other training feature that is absent
        from ``df`` is left out of the result.

        Args:
            df: DataFrame to transform (not modified; a copy is transformed)
            exclude_columns: Columns to exclude (e.g., entity_key, target)

        Returns:
            Transformed DataFrame with columns in the training feature order

        Raises:
            ValueError: If the manager has not been fitted or loaded.
        """
        if not self._is_fitted:
            raise ValueError("TransformerManager not fitted. Call fit_transform() or load().")

        df = df.copy()
        exclude = set(exclude_columns or [])
        manifest = self._bundle.manifest

        self._apply_numeric_scaling(df, manifest)
        self._apply_categorical_encoding(df, manifest)

        feature_cols = [c for c in manifest.feature_order if c not in exclude and c in df.columns]
        return df[feature_cols]

    def _apply_numeric_scaling(self, df: pd.DataFrame, manifest: TransformerManifest) -> None:
        """Scale the manifest's numeric columns of ``df`` in place.

        Columns missing from ``df`` are fed to the scaler as zeros and are then
        added to ``df`` with the value 0.0.
        """
        if self._bundle.scaler is None or not manifest.numeric_columns:
            return
        present_cols = [c for c in manifest.numeric_columns if c in df.columns]
        missing_cols = [c for c in manifest.numeric_columns if c not in df.columns]

        if present_cols:
            col_indices = {col: i for i, col in enumerate(manifest.numeric_columns)}
            temp_arr = np.zeros((len(df), len(manifest.numeric_columns)))
            for col in present_cols:
                temp_arr[:, col_indices[col]] = df[col].fillna(0).values
            # The scaler was fitted on a DataFrame; pass a frame with the same
            # column names so scikit-learn does not warn about missing feature names.
            scoring_frame = pd.DataFrame(temp_arr, columns=manifest.numeric_columns, index=df.index)
            transformed = np.asarray(self._bundle.scaler.transform(scoring_frame))
            for col in present_cols:
                df[col] = transformed[:, col_indices[col]]

        for col in missing_cols:
            df[col] = 0.0

    def _apply_categorical_encoding(self, df: pd.DataFrame, manifest: TransformerManifest) -> None:
        """Label-encode the fitted categorical columns of ``df`` in place.

        Labels not seen during fitting are encoded as 0. Fitted columns missing
        from ``df`` are added with the code of the empty string (or 0).
        """
        for col, encoder in self._bundle.encoders.items():
            if col in df.columns:
                # One label->code lookup per column instead of one
                # encoder.transform() call (and possible exception) per row.
                lookup = {label: code for code, label in enumerate(encoder.classes_)}
                codes = df[col].astype(str).map(lookup)
                df[col] = codes.fillna(0).astype("int64")
            elif col in manifest.categorical_columns:
                df[col] = self._safe_encode(encoder, "")

    def _safe_encode(self, encoder: LabelEncoder, value: str) -> int:
        """Encode a single label, returning 0 when the encoder rejects it.

        NOTE(review): 0 is also a valid code for a fitted label, so unseen
        values are indistinguishable from that label downstream.
        """
        try:
            return int(encoder.transform([value])[0])
        except ValueError:
            return 0

    def _create_scaler(self):
        """Instantiate the configured scaler; unknown names fall back to StandardScaler."""
        scalers = {"standard": StandardScaler, "robust": RobustScaler, "minmax": MinMaxScaler}
        return scalers.get(self._scaler_type, StandardScaler)()

    def save(self, path: Union[str, Path]) -> None:
        """Write the fitted bundle to ``path`` with joblib, creating parent directories.

        Raises:
            ValueError: If the manager has not been fitted.
        """
        if not self._is_fitted:
            raise ValueError("Cannot save unfitted TransformerManager")

        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(self._bundle.to_dict(), path)

    @classmethod
    def load(cls, path: Union[str, Path]) -> "TransformerManager":
        """Load a manager from a file written by ``save``.

        SECURITY: joblib files are pickle-based, so loading one can execute
        arbitrary code. Only load files from trusted sources.
        """
        # SECURITY: pickle-based deserialisation -- never call on untrusted files.
        data = joblib.load(path)
        manager = cls()
        manager._bundle = TransformerBundle.from_dict(data)
        manager._is_fitted = True
        manager._scaler_type = manager._bundle.manifest.scaler_type or "standard"
        return manager

    def log_to_mlflow(self, run_id: Optional[str] = None, artifact_path: str = "transformers") -> None:
        """Log the bundle (joblib) and its manifest (JSON) as MLflow artifacts.

        Args:
            run_id: Run to log to; when omitted, ``mlflow.log_artifact`` is used.
            artifact_path: Artifact directory to log under.

        Raises:
            ValueError: If the manager has not been fitted.
        """
        if not self._is_fitted:
            raise ValueError("Cannot log unfitted TransformerManager")

        with tempfile.TemporaryDirectory() as tmp_dir:
            bundle_path = Path(tmp_dir) / "transformers.joblib"
            joblib.dump(self._bundle.to_dict(), bundle_path)

            manifest_path = Path(tmp_dir) / "transformer_manifest.json"
            with open(manifest_path, "w", encoding="utf-8") as f:
                json.dump(self._bundle.manifest.to_dict(), f, indent=2)

            # Upload while the temporary directory still exists.
            self._log_artifacts_to_mlflow(run_id, bundle_path, manifest_path, artifact_path)

    def _log_artifacts_to_mlflow(self, run_id: Optional[str], bundle_path: Path,
                                 manifest_path: Path, artifact_path: str) -> None:
        """Upload both files to the given run, or via ``mlflow.log_artifact`` when no run_id is given."""
        import mlflow
        if run_id:
            client = mlflow.tracking.MlflowClient()
            client.log_artifact(run_id, str(bundle_path), artifact_path)
            client.log_artifact(run_id, str(manifest_path), artifact_path)
        else:
            mlflow.log_artifact(str(bundle_path), artifact_path)
            mlflow.log_artifact(str(manifest_path), artifact_path)

    @classmethod
    def load_from_mlflow(cls, run_id: str, artifact_path: str = "transformers",
                         tracking_uri: Optional[str] = None) -> "TransformerManager":
        """Download ``transformers.joblib`` from an MLflow run and load it.

        Args:
            run_id: Run that holds the artifact.
            artifact_path: Artifact directory the bundle was logged under.
            tracking_uri: If given, set as the MLflow tracking URI first.
        """
        import mlflow
        if tracking_uri:
            mlflow.set_tracking_uri(tracking_uri)
        client = mlflow.tracking.MlflowClient()
        with tempfile.TemporaryDirectory() as tmp_dir:
            local_path = client.download_artifacts(run_id, f"{artifact_path}/transformers.joblib", tmp_dir)
            # Load before the temporary directory is removed.
            return cls.load(local_path)

    @classmethod
    def load_from_mlflow_by_experiment(cls, experiment_name: str, artifact_path: str = "transformers",
                                       tracking_uri: Optional[str] = None,
                                       run_name_filter: Optional[str] = None) -> "TransformerManager":
        """Load transformers from the most recently started run of an experiment.

        Args:
            experiment_name: Name of the MLflow experiment to search.
            artifact_path: Artifact directory the bundle was logged under.
            tracking_uri: If given, set as the MLflow tracking URI first.
            run_name_filter: If given, only runs with this exact run name are considered.

        Raises:
            ValueError: If the experiment does not exist or has no matching run.
        """
        import mlflow
        if tracking_uri:
            mlflow.set_tracking_uri(tracking_uri)

        client = mlflow.tracking.MlflowClient()
        experiment = client.get_experiment_by_name(experiment_name)
        if not experiment:
            raise ValueError(f"Experiment {experiment_name} not found")

        filter_str = f'tags.mlflow.runName = "{run_name_filter}"' if run_name_filter else ""
        runs = client.search_runs(experiment_ids=[experiment.experiment_id], filter_string=filter_str,
                                  order_by=["start_time DESC"], max_results=1)
        if not runs:
            raise ValueError(f"No runs found in experiment {experiment_name}")

        return cls.load_from_mlflow(runs[0].info.run_id, artifact_path, tracking_uri)