churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
customer_retention/analysis/business/risk_profile.py
@@ -0,0 +1,199 @@
1
+ """Customer risk profiling."""
2
+
3
+ from dataclasses import dataclass
4
+ from enum import Enum
5
+ from typing import Any, List, Optional
6
+
7
+ import numpy as np
8
+ import shap
9
+
10
+ from customer_retention.core.compat import DataFrame, Series
11
+ from customer_retention.core.components.enums import RiskSegment
12
+
13
+
14
class Urgency(Enum):
    """Urgency level attached to a customer risk profile.

    Each member's value is its human-readable label.
    """

    IMMEDIATE = "Immediate"
    THIS_WEEK = "This Week"
    THIS_MONTH = "This Month"
    MONITOR = "Monitor"
19
+
20
+
21
@dataclass
class RiskFactor:
    """A single feature-level contributor to a customer's churn risk.

    All descriptive fields are pre-formatted text intended for display.
    """

    factor_name: str            # name of the feature this factor refers to
    current_value: str          # the customer's value for that feature, as text
    comparison: str             # text comparing the value against a reference
    impact: str                 # text describing direction and size of the effect
    actionable: bool            # True when the feature is considered actionable
    suggested_action: str = ""  # optional follow-up text; empty when there is none
29
+
30
+
31
@dataclass
class Intervention:
    """A retention action recommended for a customer, with its economics."""

    intervention_type: str          # catalog identifier, e.g. "phone_call"
    description: str                # display text for the action
    estimated_cost: float           # cost of applying the action to one customer
    estimated_success_rate: float   # assumed probability the action retains the customer
    expected_roi: float             # (expected revenue saved - cost) / cost
    priority: int                   # 1 is most urgent; larger numbers are less urgent
    reasoning: str                  # short explanation of why it was recommended
    channel: str                    # delivery channel, e.g. "email" or "phone"
    timing: str                     # when the action should be carried out, as text
42
+
43
+
44
@dataclass
class CustomerRiskProfile:
    """Everything reported about one customer's churn risk."""

    customer_id: Optional[str]                      # None when no identifier was supplied
    churn_probability: float                        # model-predicted probability of churn
    risk_segment: RiskSegment                       # bucket derived from the probability
    confidence: str                                 # "High", "Medium" or "Low"
    risk_factors: List[RiskFactor]                  # top drivers of the prediction
    recommended_interventions: List[Intervention]   # may be empty
    expected_ltv_if_retained: float                 # customer value if retained
    expected_ltv_if_churned: float                  # customer value if churned
    intervention_roi_estimate: float                # ROI of the first recommended intervention, 0 if none
    urgency: Urgency                                # how quickly to act
    days_until_likely_churn: Optional[int] = None   # optional estimate; None when unknown
57
+
58
+
59
class RiskProfiler:
    """Build ``CustomerRiskProfile`` objects from a fitted classifier.

    For each customer the profiler scores the churn probability with
    ``model.predict_proba`` (class index 1 is treated as "churn"), maps it to
    a ``RiskSegment``, ranks features by absolute SHAP value and attaches the
    entries of ``INTERVENTION_CATALOG`` that apply to that segment.
    """

    # (minimum probability, segment) pairs ordered from the highest threshold
    # down; probabilities below the last threshold fall through to VERY_LOW.
    SEGMENT_THRESHOLDS = [(0.80, RiskSegment.CRITICAL), (0.60, RiskSegment.HIGH),
                          (0.40, RiskSegment.MEDIUM), (0.20, RiskSegment.LOW)]
    # Each entry carries a cost, an assumed success rate, a delivery channel
    # and the risk segments it applies to. No entry targets VERY_LOW, so those
    # customers receive no recommendations.
    INTERVENTION_CATALOG = [
        {"name": "email_campaign", "cost": 2, "success_rate": 0.10, "channel": "email",
         "segments": [RiskSegment.LOW, RiskSegment.MEDIUM]},
        {"name": "phone_call", "cost": 15, "success_rate": 0.25, "channel": "phone",
         "segments": [RiskSegment.MEDIUM, RiskSegment.HIGH]},
        {"name": "discount_offer", "cost": 25, "success_rate": 0.35, "channel": "email",
         "segments": [RiskSegment.HIGH, RiskSegment.CRITICAL]},
        {"name": "account_manager", "cost": 150, "success_rate": 0.60, "channel": "personal",
         "segments": [RiskSegment.CRITICAL]},
    ]
    # Number of top SHAP-ranked features reported per profile.
    TOP_RISK_FACTORS = 5
    # Fraction of the average LTV reported as the value of a churned customer.
    CHURNED_LTV_FRACTION = 0.1

    def __init__(self, model: Any, background_data: DataFrame,
                 actionable_features: Optional[List[str]] = None,
                 avg_customer_ltv: float = 500, max_samples: int = 100):
        """Store the model and build the SHAP explainer.

        Args:
            model: Fitted classifier exposing ``predict_proba``.
            background_data: Reference rows; only the first ``max_samples``
                are kept. Its columns define the feature order.
            actionable_features: Feature names flagged as actionable in the
                reported risk factors. Defaults to none.
            avg_customer_ltv: Average customer lifetime value used for the
                revenue and ROI estimates.
            max_samples: Maximum number of background rows to keep.
        """
        self.model = model
        self.background_data = background_data.head(max_samples)
        self.actionable_features = actionable_features or []
        self.avg_ltv = avg_customer_ltv
        self.feature_names = list(background_data.columns)
        self._explainer = self._create_explainer()

    def _create_explainer(self) -> shap.Explainer:
        """Return a tree explainer for known tree ensembles, else a kernel explainer."""
        # Matching on the class name avoids importing the model library here.
        model_type = type(self.model).__name__
        if model_type in ("RandomForestClassifier", "GradientBoostingClassifier"):
            return shap.TreeExplainer(self.model)
        return shap.KernelExplainer(self.model.predict_proba, self.background_data)

    def generate_profile(self, instance: Series,
                         customer_id: Optional[str] = None) -> CustomerRiskProfile:
        """Build the risk profile for a single customer row.

        Args:
            instance: One row of feature values, indexed by feature name.
            customer_id: Optional identifier copied into the profile.

        Returns:
            The assembled profile. ``days_until_likely_churn`` is not
            populated here and keeps its default of ``None``.
        """
        instance_df = instance.to_frame().T
        churn_prob = float(self.model.predict_proba(instance_df)[0, 1])
        segment = self._assign_segment(churn_prob)
        confidence = self._assess_confidence(churn_prob)
        risk_factors = self._extract_risk_factors(instance)
        interventions = self._match_interventions(segment, churn_prob)
        # Interventions come back sorted by ROI, so the first one is the best.
        roi = interventions[0].expected_roi if interventions else 0
        return CustomerRiskProfile(
            customer_id=customer_id,
            churn_probability=churn_prob,
            risk_segment=segment,
            confidence=confidence,
            risk_factors=risk_factors,
            recommended_interventions=interventions,
            expected_ltv_if_retained=self.avg_ltv,
            expected_ltv_if_churned=self.avg_ltv * self.CHURNED_LTV_FRACTION,
            intervention_roi_estimate=roi,
            urgency=self._assign_urgency(segment),
        )

    def _assign_segment(self, probability: float) -> RiskSegment:
        """Return the first segment whose threshold the probability reaches."""
        for threshold, segment in self.SEGMENT_THRESHOLDS:
            if probability >= threshold:
                return segment
        return RiskSegment.VERY_LOW

    def _assess_confidence(self, probability: float) -> str:
        """Label confidence by how far the probability is from the middle.

        Below 0.2 or above 0.8 is "High", strictly between 0.4 and 0.6 is
        "Low", everything else is "Medium".
        """
        if probability < 0.2 or probability > 0.8:
            return "High"
        if 0.4 < probability < 0.6:
            return "Low"
        return "Medium"

    def _extract_risk_factors(self, instance: Series) -> List[RiskFactor]:
        """Describe the features with the largest absolute SHAP values.

        At most ``TOP_RISK_FACTORS`` factors are returned, strongest first.
        """
        shap_values = self._extract_shap_values(instance.to_frame().T)
        ranked = np.argsort(np.abs(shap_values))[::-1]
        factors = []
        for idx in ranked[:self.TOP_RISK_FACTORS]:
            feature = self.feature_names[idx]
            value = instance[feature]
            contribution = shap_values[idx]
            direction = "increases" if contribution > 0 else "decreases"
            impact_pct = abs(contribution) * 100
            actionable = feature in self.actionable_features
            # np.floating covers NumPy floats that do not subclass Python's
            # float (e.g. float32), so they get the same two-decimal format.
            if isinstance(value, (float, np.floating)):
                shown_value = f"{value:.2f}"
            else:
                shown_value = str(value)
            factors.append(RiskFactor(
                factor_name=feature,
                current_value=shown_value,
                comparison=f"vs avg {self.background_data[feature].mean():.2f}",
                impact=f"{direction} risk by {impact_pct:.1f}%",
                actionable=actionable,
                suggested_action=f"Improve {feature}" if actionable else "",
            ))
        return factors

    def _extract_shap_values(self, X: DataFrame) -> np.ndarray:
        """Return a flat array of SHAP values for class index 1.

        The explainer output may be an object with ``.values``, a per-class
        list, or an array with a trailing class axis; each shape is reduced
        to the values for class index 1 before flattening.
        """
        shap_values = self._explainer.shap_values(X)
        if hasattr(shap_values, 'values'):
            shap_values = shap_values.values
        if isinstance(shap_values, list):
            shap_values = shap_values[1]
        if len(shap_values.shape) == 3:
            shap_values = shap_values[:, :, 1]
        return shap_values.flatten()

    def _match_interventions(self, segment: RiskSegment, churn_prob: float) -> List[Intervention]:
        """Return catalog interventions for ``segment``, best expected ROI first."""
        # These depend only on the segment, so compute them once.
        priority = self._get_priority(segment)
        reasoning = f"Recommended for {segment.value} risk customers"
        timing = "Within 24 hours" if segment == RiskSegment.CRITICAL else "Within 1 week"

        interventions = []
        for item in self.INTERVENTION_CATALOG:
            if segment not in item["segments"]:
                continue
            cost = item["cost"]
            expected_saves = churn_prob * item["success_rate"]
            revenue_saved = expected_saves * self.avg_ltv
            # A zero cost would divide by zero; report 0 ROI in that case.
            roi = (revenue_saved - cost) / cost if cost > 0 else 0
            interventions.append(Intervention(
                intervention_type=item["name"],
                description=f"{item['name'].replace('_', ' ').title()} via {item['channel']}",
                estimated_cost=cost,
                estimated_success_rate=item["success_rate"],
                expected_roi=roi,
                priority=priority,
                reasoning=reasoning,
                channel=item["channel"],
                timing=timing,
            ))
        return sorted(interventions, key=lambda x: x.expected_roi, reverse=True)

    def _get_priority(self, segment: RiskSegment) -> int:
        """Map a segment to a priority from 1 (CRITICAL) to 5 (VERY_LOW or unknown)."""
        priorities = {RiskSegment.CRITICAL: 1, RiskSegment.HIGH: 2,
                      RiskSegment.MEDIUM: 3, RiskSegment.LOW: 4, RiskSegment.VERY_LOW: 5}
        return priorities.get(segment, 5)

    def _assign_urgency(self, segment: RiskSegment) -> Urgency:
        """Map a segment to an urgency; unknown segments default to MONITOR."""
        urgency_map = {
            RiskSegment.CRITICAL: Urgency.IMMEDIATE,
            RiskSegment.HIGH: Urgency.THIS_WEEK,
            RiskSegment.MEDIUM: Urgency.THIS_MONTH,
            RiskSegment.LOW: Urgency.MONITOR,
            RiskSegment.VERY_LOW: Urgency.MONITOR,
        }
        return urgency_map.get(segment, Urgency.MONITOR)

    def generate_batch(self, X: DataFrame, customer_ids: Optional[List[str]] = None,
                       sort_by_risk: bool = False) -> List[CustomerRiskProfile]:
        """Build one profile per row of ``X``.

        Args:
            X: Feature rows, one customer per row.
            customer_ids: Identifiers aligned with the rows of ``X``. When
                omitted or empty, every profile gets ``customer_id=None``.
            sort_by_risk: When True, return profiles ordered by descending
                churn probability instead of row order.
        """
        customer_ids = customer_ids or [None] * len(X)
        # Positional indexing keeps ids aligned with rows regardless of X's index.
        profiles = [self.generate_profile(X.iloc[i], customer_ids[i]) for i in range(len(X))]
        if sort_by_risk:
            profiles.sort(key=lambda p: p.churn_probability, reverse=True)
        return profiles
@@ -0,0 +1,139 @@
1
+ """ROI analysis for retention interventions."""
2
+
3
from dataclasses import dataclass
from typing import Any, Dict, List
5
+
6
+
7
@dataclass
class InterventionROI:
    """ROI figures for applying one intervention to a group of customers.

    Instances are produced by ``ROIAnalyzer.calculate_roi``, which derives
    each figure from the previous ones as noted on the fields below.
    """

    # Name of the intervention, as keyed in the analyzer's cost table.
    intervention: str
    # Number of customers the intervention is applied to.
    targeted_customers: int
    # targeted_customers * churn rate.
    actual_churners: float
    # actual_churners * intervention success rate.
    customers_saved: float
    # targeted_customers * per-customer intervention cost.
    total_cost: float
    # customers_saved * average lifetime value.
    revenue_saved: float
    # revenue_saved - total_cost.
    net_benefit: float
    # net_benefit / total_cost * 100, or 0 when total_cost is not positive.
    roi_pct: float
17
+
18
+
19
@dataclass
class OptimizationResult:
    """Outcome of ``ROIAnalyzer.optimize_budget``.

    The totals are sums over the entries of ``allocations``.
    """

    # Segment name -> chosen allocation. Each allocation is a dict with the
    # keys "intervention", "customers", "cost", "expected_saves" and
    # "expected_revenue", so the values are of mixed types.
    allocations: Dict[str, Dict[str, Any]]
    # Sum of the allocated interventions' costs.
    total_cost: float
    # Sum of the expected number of customers saved.
    total_saves: float
    # Sum of the expected revenue retained.
    total_revenue: float
    # (total_revenue - total_cost) / total_cost, or 0 when nothing was spent.
    overall_roi: float
26
+
27
+
28
@dataclass
class ROIResult:
    """Container for a set of intervention ROI figures and the best one.

    Nothing in this module constructs it; it is provided as a plain record
    for callers.
    """

    # Per-intervention ROI records.
    intervention_rois: List[InterventionROI]
    # Name of the intervention regarded as best.
    best_intervention: str
    # ROI figure belonging to best_intervention.
    best_roi: float
33
+
34
+
35
class ROIAnalyzer:
    """Estimate the return on investment of retention interventions.

    The model is linear: an intervention applied to ``n`` customers with
    churn rate ``r`` costs ``n * cost`` and is expected to retain
    ``n * r * success_rate`` customers, each worth ``avg_ltv``.

    An intervention without an entry in ``success_rates`` is treated as
    having a success rate of 0 everywhere in this class.
    """

    def __init__(self, avg_ltv: float, intervention_costs: Dict[str, float],
                 success_rates: Dict[str, float]):
        """Store the model parameters.

        Args:
            avg_ltv: Revenue attributed to each retained customer.
            intervention_costs: Per-customer cost keyed by intervention name.
                Its keys define the set of interventions that
                ``analyze_all_interventions`` and ``optimize_budget`` consider.
            success_rates: Fraction of churners retained, keyed by
                intervention name.
        """
        self.avg_ltv = avg_ltv
        self.intervention_costs = intervention_costs
        self.success_rates = success_rates

    def calculate_roi(self, intervention: str, targeted_customers: int,
                      actual_churn_rate: float) -> InterventionROI:
        """Compute the ROI figures for one intervention on one group.

        An unknown intervention gets a cost and success rate of 0.
        ``roi_pct`` is 0 when the total cost is not positive.
        """
        cost = self.intervention_costs.get(intervention, 0)
        success_rate = self.success_rates.get(intervention, 0)
        actual_churners = targeted_customers * actual_churn_rate
        customers_saved = actual_churners * success_rate
        revenue_saved = customers_saved * self.avg_ltv
        total_cost = targeted_customers * cost
        net_benefit = revenue_saved - total_cost
        # Guard the division: a free intervention reports a neutral ROI.
        roi_pct = (net_benefit / total_cost * 100) if total_cost > 0 else 0
        return InterventionROI(
            intervention=intervention,
            targeted_customers=targeted_customers,
            actual_churners=actual_churners,
            customers_saved=customers_saved,
            total_cost=total_cost,
            revenue_saved=revenue_saved,
            net_benefit=net_benefit,
            roi_pct=roi_pct,
        )

    def analyze_all_interventions(self, targeted_customers: int,
                                  actual_churn_rate: float) -> List[InterventionROI]:
        """Return the ROI of every costed intervention, highest ``roi_pct`` first."""
        results = [
            self.calculate_roi(intervention, targeted_customers, actual_churn_rate)
            for intervention in self.intervention_costs
        ]
        return sorted(results, key=lambda r: r.roi_pct, reverse=True)

    def compare_interventions(self, targeted_customers: int,
                              actual_churn_rate: float) -> List[InterventionROI]:
        """Alias of ``analyze_all_interventions``."""
        return self.analyze_all_interventions(targeted_customers, actual_churn_rate)

    def analyze_by_segment(self, segment_data: Dict[str, Dict]) -> Dict[str, List[InterventionROI]]:
        """Run ``analyze_all_interventions`` for each segment.

        Args:
            segment_data: Segment name -> dict with the keys ``"customers"``
                and ``"churn_rate"``.

        Returns:
            Segment name -> ranked ROI records for that segment.
        """
        return {
            segment: self.analyze_all_interventions(data["customers"], data["churn_rate"])
            for segment, data in segment_data.items()
        }

    def optimize_budget(self, segment_data: Dict[str, Dict], total_budget: float,
                        objective: str = "maximize_roi") -> OptimizationResult:
        """Greedily pick at most one intervention per segment within a budget.

        Every (segment, intervention) pair is scored, the pairs are ranked by
        the objective, and the ranking is walked once. A pair is taken when
        its segment has no allocation yet and its full cost still fits the
        remaining budget. A segment is always treated as a whole, never
        partially. Pairs are not filtered by profitability, so a pair with
        negative ROI can be allocated when budget remains.

        Args:
            segment_data: Segment name -> dict with the keys ``"customers"``
                and ``"churn_rate"``.
            total_budget: Maximum total spend.
            objective: ``"maximize_roi"`` ranks pairs by ROI. Any other value
                ranks them by expected number of customers saved.

        Returns:
            The chosen allocations keyed by segment, with their totals.
        """
        options = []
        for segment, data in segment_data.items():
            customers = data["customers"]
            churn_rate = data["churn_rate"]
            for intervention, cost_per in self.intervention_costs.items():
                # .get keeps this consistent with calculate_roi: a costed
                # intervention without a success rate counts as rate 0
                # instead of raising KeyError.
                success_rate = self.success_rates.get(intervention, 0)
                total_cost = customers * cost_per
                expected_saves = customers * churn_rate * success_rate
                expected_revenue = expected_saves * self.avg_ltv
                if total_cost > 0:
                    roi = (expected_revenue - total_cost) / total_cost
                else:
                    roi = 0
                options.append({
                    "segment": segment,
                    "intervention": intervention,
                    "customers": customers,
                    "churn_rate": churn_rate,
                    "cost_per": cost_per,
                    "success_rate": success_rate,
                    "total_cost": total_cost,
                    "expected_saves": expected_saves,
                    "expected_revenue": expected_revenue,
                    "roi": roi,
                })

        ranking_key = "roi" if objective == "maximize_roi" else "expected_saves"
        options.sort(key=lambda option: option[ranking_key], reverse=True)

        allocations = {}
        remaining_budget = total_budget
        total_saves = 0
        total_revenue = 0
        total_cost = 0
        for option in options:
            if option["segment"] in allocations or option["total_cost"] > remaining_budget:
                continue
            allocations[option["segment"]] = {
                "intervention": option["intervention"],
                "customers": option["customers"],
                "cost": option["total_cost"],
                "expected_saves": option["expected_saves"],
                "expected_revenue": option["expected_revenue"],
            }
            remaining_budget -= option["total_cost"]
            total_saves += option["expected_saves"]
            total_revenue += option["expected_revenue"]
            total_cost += option["total_cost"]

        overall_roi = (total_revenue - total_cost) / total_cost if total_cost > 0 else 0
        return OptimizationResult(
            allocations=allocations,
            total_cost=total_cost,
            total_saves=total_saves,
            total_revenue=total_revenue,
            overall_roi=overall_roi,
        )

    def run_scenarios(self, intervention: str, targeted_customers: int,
                      churn_rates: List[float]) -> List[InterventionROI]:
        """Compute the ROI of one intervention under each churn rate, in order."""
        return [self.calculate_roi(intervention, targeted_customers, rate) for rate in churn_rates]
@@ -0,0 +1,20 @@
1
+ from customer_retention.core.components.enums import Severity
2
+
3
+ from .calibration_analyzer import CalibrationAnalyzer, CalibrationCheck, CalibrationResult
4
+ from .cv_analyzer import CVAnalysisResult, CVAnalyzer, CVCheck
5
+ from .error_analyzer import ErrorAnalysisResult, ErrorAnalyzer, ErrorPattern
6
+ from .leakage_detector import LeakageCheck, LeakageDetector, LeakageResult
7
+ from .noise_tester import NoiseResult, NoiseTester
8
+ from .overfitting_analyzer import OverfittingAnalyzer, OverfittingCheck, OverfittingResult
9
+ from .segment_analyzer import SegmentCheck, SegmentPerformanceAnalyzer, SegmentResult
10
+
11
# Names re-exported by this package: the shared Severity enum plus each
# analyzer together with the result/check types imported alongside it.
__all__ = [
    "Severity",
    # Leakage detection
    "LeakageDetector",
    "LeakageResult",
    "LeakageCheck",
    # Overfitting analysis
    "OverfittingAnalyzer",
    "OverfittingResult",
    "OverfittingCheck",
    # Cross-validation stability
    "CVAnalyzer",
    "CVAnalysisResult",
    "CVCheck",
    # Per-segment performance
    "SegmentPerformanceAnalyzer",
    "SegmentResult",
    "SegmentCheck",
    # Probability calibration
    "CalibrationAnalyzer",
    "CalibrationResult",
    "CalibrationCheck",
    # Error analysis
    "ErrorAnalyzer",
    "ErrorAnalysisResult",
    "ErrorPattern",
    # Noise testing
    "NoiseTester",
    "NoiseResult",
]
@@ -0,0 +1,133 @@
1
+ """Calibration analysis probes for model validation."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Dict, List
5
+
6
+ import numpy as np
7
+
8
+ from customer_retention.core.components.enums import Severity
9
+
10
+
11
@dataclass
class CalibrationCheck:
    """A single calibration finding reported by ``CalibrationAnalyzer``."""

    # Finding code; the analyzer uses "CA001" through "CA004".
    check_id: str
    # Metric the finding is about: "brier_score", "ece" or "mce".
    metric: str
    # Severity level assigned to the finding.
    severity: Severity
    # Human-readable advice for addressing the finding.
    recommendation: str
    # Measured value of the metric.
    value: float = 0.0
18
+
19
+
20
@dataclass
class CalibrationResult:
    """Outcome of a calibration analysis run by ``CalibrationAnalyzer``."""

    # True when none of the checks has critical severity.
    passed: bool
    # Findings raised during the analysis.
    checks: List[CalibrationCheck] = field(default_factory=list)
    # Mean squared difference between predicted probability and label.
    brier_score: float = 0.0
    # Expected calibration error (sample-weighted mean bin gap).
    ece: float = 0.0
    # Maximum calibration error (largest bin gap).
    mce: float = 0.0
    # One dict per non-empty probability bin, with the keys "bin",
    # "predicted_prob", "actual_prob" and "count".
    reliability_data: List[Dict[str, float]] = field(default_factory=list)
    # Overall advice; left empty by the Brier-only analysis.
    recommendation: str = ""
29
+
30
+
31
class CalibrationAnalyzer:
    """Check how closely predicted probabilities track observed outcomes.

    Three metrics are computed: the Brier score (mean squared difference
    between probability and label), the expected calibration error (ECE,
    the sample-weighted mean gap between predicted and observed rate over
    ``N_BINS`` equal-width probability bins) and the maximum calibration
    error (MCE, the largest such gap). Metrics above the class thresholds
    are reported as ``CalibrationCheck`` records.

    Labels and probabilities may be any array-likes of equal length (lists,
    NumPy arrays, pandas Series). They are flattened to 1-D float arrays
    before use, so an ``(n, 1)`` probability column is handled correctly.

    ``passed`` is false only when a check has ``Severity.CRITICAL``; the
    checks emitted by this class use ``HIGH`` and ``MEDIUM`` only.
    """

    BRIER_HIGH = 0.20
    BRIER_MEDIUM = 0.15
    ECE_HIGH = 0.10
    MCE_HIGH = 0.30
    N_BINS = 10

    def analyze_brier(self, y_true: np.ndarray, y_proba: np.ndarray) -> CalibrationResult:
        """Evaluate the Brier score only.

        Args:
            y_true: Observed labels.
            y_proba: Predicted probabilities, one per label.

        Returns:
            A result carrying the Brier score and, when it exceeds
            ``BRIER_MEDIUM``, one check describing it.

        Raises:
            ValueError: If the two inputs differ in length.
        """
        labels, probs = self._as_arrays(y_true, y_proba)
        brier = self._brier_score(labels, probs)
        checks = self._brier_checks(brier)
        return CalibrationResult(passed=self._passed(checks), checks=checks, brier_score=brier)

    def _classify_brier(self, brier: float) -> tuple:
        """Return ``(severity, check_id)`` for a Brier score."""
        if brier > self.BRIER_HIGH:
            return Severity.HIGH, "CA001"
        if brier > self.BRIER_MEDIUM:
            return Severity.MEDIUM, "CA002"
        return Severity.INFO, "CA000"

    def _brier_recommendation(self, brier: float) -> str:
        """Return the advice text matching a Brier score."""
        if brier > self.BRIER_HIGH:
            return f"HIGH: Brier score {brier:.3f} is poor. Apply calibration (Platt scaling or isotonic)."
        if brier > self.BRIER_MEDIUM:
            return f"MEDIUM: Brier score {brier:.3f} is moderate. Consider calibration."
        return f"OK: Brier score {brier:.3f} is acceptable."

    def analyze_calibration(self, y_true: np.ndarray, y_proba: np.ndarray) -> CalibrationResult:
        """Evaluate Brier score, ECE and MCE together.

        Args:
            y_true: Observed labels.
            y_proba: Predicted probabilities, one per label.

        Returns:
            A result with all three metrics, the per-bin reliability data,
            any threshold checks and an overall recommendation.

        Raises:
            ValueError: If the two inputs differ in length.
        """
        labels, probs = self._as_arrays(y_true, y_proba)
        brier = self._brier_score(labels, probs)
        reliability_data, ece, mce = self._compute_reliability(labels, probs)
        checks = self._brier_checks(brier)
        if ece > self.ECE_HIGH:
            checks.append(CalibrationCheck(
                check_id="CA003", metric="ece", severity=Severity.MEDIUM,
                recommendation=f"MEDIUM: ECE {ece:.3f} is high. Calibration recommended.", value=ece,
            ))
        if mce > self.MCE_HIGH:
            checks.append(CalibrationCheck(
                check_id="CA004", metric="mce", severity=Severity.HIGH,
                recommendation=f"HIGH: MCE {mce:.3f} is extreme. Some probability bins are very miscalibrated.", value=mce,
            ))
        return CalibrationResult(
            passed=self._passed(checks),
            checks=checks,
            brier_score=brier,
            ece=ece,
            mce=mce,
            reliability_data=reliability_data,
            recommendation=self._global_recommendation(reliability_data, brier, ece),
        )

    def _compute_reliability(self, y_true: np.ndarray, y_proba: np.ndarray) -> tuple:
        """Bin the probabilities and measure the gap to the observed rate.

        Bins are half-open ``[lower, upper)`` except the last, which also
        includes its upper edge so that a probability of exactly 1.0 is
        counted. Probabilities outside ``[0, 1]`` fall into no bin.

        Returns:
            ``(reliability_data, ece, mce)`` where ``reliability_data`` has
            one dict per non-empty bin. ECE is normalised by the total number
            of samples and is 0.0 for empty input.
        """
        labels, probs = self._as_arrays(y_true, y_proba)
        bin_edges = np.linspace(0, 1, self.N_BINS + 1)
        last_bin = self.N_BINS - 1
        reliability_data = []
        ece_sum = 0.0
        mce = 0.0
        for i in range(self.N_BINS):
            lower, upper = bin_edges[i], bin_edges[i + 1]
            if i == last_bin:
                mask = (probs >= lower) & (probs <= upper)
            else:
                mask = (probs >= lower) & (probs < upper)
            bin_size = int(mask.sum())
            if bin_size == 0:
                continue
            predicted = float(probs[mask].mean())
            actual = float(labels[mask].mean())
            reliability_data.append({
                "bin": i,
                "predicted_prob": predicted,
                "actual_prob": actual,
                "count": bin_size,
            })
            error = abs(predicted - actual)
            ece_sum += error * bin_size
            mce = max(mce, error)
        n_samples = len(labels)
        ece = ece_sum / n_samples if n_samples > 0 else 0.0
        return reliability_data, float(ece), float(mce)

    def _global_recommendation(self, reliability_data: List[Dict], brier: float, ece: float) -> str:
        """Summarise the calibration state as a single piece of advice.

        Bins whose predicted probability is more than 0.05 above the observed
        rate count as over-prediction, those more than 0.05 below as
        under-prediction; the majority decides the advice.
        """
        if brier < 0.10 and ece < 0.05:
            return "Well calibrated. No action needed."
        above_diagonal = sum(1 for b in reliability_data if b["predicted_prob"] > b["actual_prob"] + 0.05)
        below_diagonal = sum(1 for b in reliability_data if b["predicted_prob"] < b["actual_prob"] - 0.05)
        if above_diagonal > below_diagonal:
            return "Overconfident predictions. Apply Platt scaling."
        if below_diagonal > above_diagonal:
            return "Underconfident predictions. Consider isotonic regression."
        return "Apply CalibratedClassifierCV for general calibration improvement."

    @staticmethod
    def _as_arrays(y_true, y_proba) -> tuple:
        """Coerce labels and probabilities to 1-D float arrays of equal length."""
        labels = np.asarray(y_true, dtype=float).ravel()
        probs = np.asarray(y_proba, dtype=float).ravel()
        if labels.shape != probs.shape:
            raise ValueError(
                f"y_true and y_proba must have the same length, got {labels.size} and {probs.size}"
            )
        return labels, probs

    @staticmethod
    def _brier_score(labels: np.ndarray, probs: np.ndarray) -> float:
        """Return the mean squared difference between probabilities and labels."""
        return float(np.mean((probs - labels) ** 2))

    def _brier_checks(self, brier: float) -> list:
        """Return a one-element list with the Brier check, or an empty list when it is INFO."""
        severity, check_id = self._classify_brier(brier)
        if severity == Severity.INFO:
            return []
        return [CalibrationCheck(
            check_id=check_id,
            metric="brier_score",
            severity=severity,
            recommendation=self._brier_recommendation(brier),
            value=brier,
        )]

    @staticmethod
    def _passed(checks) -> bool:
        """Return True when no check has critical severity."""
        return not any(c.severity == Severity.CRITICAL for c in checks)
@@ -0,0 +1,144 @@
1
+ """Cross-validation stability analysis probes."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Dict, List, Optional
5
+
6
+ import numpy as np
7
+
8
+ from customer_retention.core.components.enums import Severity
9
+
10
+
11
@dataclass
class CVCheck:
    """A single cross-validation finding reported by ``CVAnalyzer``."""

    # Finding code; the analyzer uses "CV001"-"CV005" and "CV010"-"CV012".
    check_id: str
    # What was measured: "cv_std", "outlier_folds" or "cv_test_gap".
    metric: str
    # Severity level assigned to the finding.
    severity: Severity
    # Human-readable advice for addressing the finding.
    recommendation: str
    # Measured value of the metric.
    value: float = 0.0
18
+
19
+
20
@dataclass
class CVAnalysisResult:
    """Outcome of a cross-validation analysis run by ``CVAnalyzer``.

    Each analyzer method fills only the fields it computes; the rest keep
    their defaults.
    """

    # True when none of the checks has critical severity.
    passed: bool
    # Findings raised during the analysis.
    checks: List[CVCheck] = field(default_factory=list)
    # Mean of the fold scores.
    cv_mean: float = 0.0
    # Standard deviation of the fold scores.
    cv_std: float = 0.0
    # One dict per fold with the keys "fold", "score" and "deviation".
    fold_analysis: List[Dict[str, float]] = field(default_factory=list)
    # Difference between the best and the worst fold score.
    best_worst_gap: float = 0.0
    # Indices of folds flagged as outliers.
    outlier_folds: List[int] = field(default_factory=list)
    # Advice texts of the critical and high severity checks.
    recommendations: List[str] = field(default_factory=list)
30
+
31
+
32
class CVAnalyzer:
    """Assess how stable a model's cross-validation scores are.

    Three probes are available: the spread of the fold scores, per-fold
    outliers, and the gap between the CV mean and a held-out test score.
    ``run_all`` combines them. A result counts as passed when none of its
    checks has ``Severity.CRITICAL``.
    """

    STD_CRITICAL = 0.15
    STD_HIGH = 0.10
    STD_MEDIUM = 0.05
    CV_TEST_HIGH = 0.10
    CV_TEST_MEDIUM = -0.10

    @staticmethod
    def _no_critical(checks) -> bool:
        """Return True when no check in *checks* is critical."""
        return not any(check.severity == Severity.CRITICAL for check in checks)

    def analyze_variance(self, cv_scores: List[float]) -> CVAnalysisResult:
        """Grade the standard deviation of the fold scores.

        Exactly one check is always emitted, including the INFO-level one for
        a stable model.
        """
        mean_score = np.mean(cv_scores)
        spread = np.std(cv_scores)
        level, code = self._classify_variance(spread)
        findings = [CVCheck(
            check_id=code,
            metric="cv_std",
            severity=level,
            recommendation=self._variance_recommendation(spread),
            value=spread,
        )]
        return CVAnalysisResult(
            passed=self._no_critical(findings), checks=findings, cv_mean=mean_score, cv_std=spread,
        )

    def _classify_variance(self, cv_std: float) -> tuple:
        """Return ``(severity, check_id)`` for a score standard deviation."""
        tiers = (
            (self.STD_CRITICAL, Severity.CRITICAL, "CV001"),
            (self.STD_HIGH, Severity.HIGH, "CV002"),
            (self.STD_MEDIUM, Severity.MEDIUM, "CV003"),
        )
        # Tiers are ordered from strictest to mildest; the first one exceeded wins.
        for threshold, level, code in tiers:
            if cv_std > threshold:
                return level, code
        return Severity.INFO, "CV004"

    def _variance_recommendation(self, cv_std: float) -> str:
        """Return the advice text matching a score standard deviation."""
        shown = f"{cv_std:.3f}"
        if cv_std > self.STD_CRITICAL:
            return f"CRITICAL: CV std {shown} is very high. Model is unstable. Use more data or robust methods."
        if cv_std > self.STD_HIGH:
            return f"HIGH: CV std {shown} is high. Consider ensemble methods or robust scaling."
        if cv_std > self.STD_MEDIUM:
            return f"MEDIUM: CV std {shown} is moderate. Monitor closely."
        return f"OK: CV std {shown} indicates stable model."

    def analyze_folds(self, cv_scores: List[float]) -> CVAnalysisResult:
        """Describe each fold and flag folds far from the mean.

        A fold is flagged when its score lies more than two standard
        deviations from the mean of all folds.

        NOTE(review): ``np.std`` defaults to the population standard
        deviation, under which no value can lie more than ``sqrt(n - 1)``
        standard deviations from the mean. For five or fewer folds that bound
        is at most 2, so the strict ``>`` test can essentially never fire --
        confirm whether a different outlier rule is wanted for small ``n``.
        """
        mean_score = np.mean(cv_scores)
        spread = np.std(cv_scores)
        per_fold = []
        flagged = []
        for index, score in enumerate(cv_scores):
            deviation = score - mean_score
            per_fold.append({"fold": index, "score": score, "deviation": deviation})
            if abs(deviation) > 2 * spread:
                flagged.append(index)
        gap = max(cv_scores) - min(cv_scores)

        findings = []
        if flagged:
            findings.append(CVCheck(
                check_id="CV005",
                metric="outlier_folds",
                severity=Severity.HIGH,
                recommendation=f"HIGH: Folds {flagged} are outliers (>2 std from mean). Investigate data heterogeneity.",
                value=len(flagged),
            ))
        return CVAnalysisResult(
            passed=self._no_critical(findings),
            checks=findings,
            cv_mean=mean_score,
            cv_std=spread,
            fold_analysis=per_fold,
            best_worst_gap=gap,
            outlier_folds=flagged,
        )

    def compare_cv_test(self, cv_mean: float, test_score: float) -> CVAnalysisResult:
        """Grade the gap ``cv_mean - test_score``.

        Exactly one check is always emitted. The result carries ``cv_mean``
        only; the remaining metric fields keep their defaults.
        """
        gap = cv_mean - test_score
        level, code = self._classify_cv_test_gap(gap)
        findings = [CVCheck(
            check_id=code,
            metric="cv_test_gap",
            severity=level,
            recommendation=self._cv_test_recommendation(gap, cv_mean, test_score),
            value=gap,
        )]
        return CVAnalysisResult(passed=self._no_critical(findings), checks=findings, cv_mean=cv_mean)

    def _classify_cv_test_gap(self, gap: float) -> tuple:
        """Return ``(severity, check_id)`` for a CV-minus-test gap."""
        if gap > self.CV_TEST_HIGH:
            # CV score well above the test score.
            return Severity.HIGH, "CV010"
        if gap < self.CV_TEST_MEDIUM:
            # CV score well below the test score.
            return Severity.MEDIUM, "CV011"
        return Severity.INFO, "CV012"

    def _cv_test_recommendation(self, gap: float, cv_mean: float, test_score: float) -> str:
        """Return the advice text matching a CV-minus-test gap."""
        cv_shown = f"{cv_mean:.3f}"
        test_shown = f"{test_score:.3f}"
        if gap > self.CV_TEST_HIGH:
            return f"HIGH: CV mean {cv_shown} >> test {test_shown}. CV may be overly optimistic."
        if gap < self.CV_TEST_MEDIUM:
            return f"MEDIUM: CV mean {cv_shown} << test {test_shown}. CV may be pessimistic."
        return f"OK: CV mean {cv_shown} ≈ test {test_shown}. Good estimate."

    def run_all(self, cv_scores: List[float], test_score: Optional[float] = None) -> CVAnalysisResult:
        """Run every probe and merge the findings into one result.

        The CV-versus-test comparison is included only when *test_score* is
        given. ``recommendations`` collects the advice of the critical and
        high severity checks.
        """
        variance = self.analyze_variance(cv_scores)
        folds = self.analyze_folds(cv_scores)
        combined = [*variance.checks, *folds.checks]
        if test_score is not None:
            combined.extend(self.compare_cv_test(variance.cv_mean, test_score).checks)
        urgent_levels = (Severity.CRITICAL, Severity.HIGH)
        advice = [check.recommendation for check in combined if check.severity in urgent_levels]
        return CVAnalysisResult(
            passed=self._no_critical(combined),
            checks=combined,
            cv_mean=variance.cv_mean,
            cv_std=variance.cv_std,
            fold_analysis=folds.fold_analysis,
            best_worst_gap=folds.best_worst_gap,
            outlier_folds=folds.outlier_folds,
            recommendations=advice,
        )