churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,310 @@
1
+ import json
2
+ from dataclasses import asdict, dataclass
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ import numpy as np
6
+
7
+ from customer_retention.core.compat import pd
8
+ from customer_retention.core.config import ColumnType
9
+ from customer_retention.core.utils.statistics import (
10
+ compute_chi_square,
11
+ compute_ks_statistic,
12
+ compute_psi_categorical,
13
+ compute_psi_numeric,
14
+ )
15
+
16
+ from .profile_result import ProfileResult
17
+
18
+
19
@dataclass
class DriftResult:
    """Outcome of a drift check on one column.

    A plain record: it carries the verdict, the numbers behind it and any
    human-readable follow-up advice, and can be flattened to a ``dict``.
    """

    # Name of the column that was checked.
    column_name: str
    # True when the check concluded that the distribution moved.
    has_drift: bool
    # One of "low", "medium", "high", "critical".
    severity: str
    # Raw statistics collected during the check, keyed by metric name.
    metrics: Dict[str, Any]
    # Free-text suggestions for whoever reads the result.
    recommendations: List[str]

    def to_dict(self) -> dict:
        """Return the record as a dictionary built by ``dataclasses.asdict``."""
        as_mapping = asdict(self)
        return as_mapping
31
+
32
+
33
class BaselineDriftChecker:
    """Detects distribution drift between baseline and current data.

    A baseline is a per-column dictionary of summary statistics (numeric
    columns) or category frequencies (categorical columns). It can be captured
    from raw series, derived from a ``ProfileResult``, or loaded from JSON.
    Later data is compared against it column by column.
    """

    # PSI cut-offs used to grade severity (checked from highest to lowest).
    _PSI_CRITICAL = 0.5
    _PSI_HIGH = 0.2
    _PSI_MEDIUM = 0.1
    # Number of equal-width bins stored for numeric baselines.
    _HISTOGRAM_BINS = 10

    def __init__(self):
        # Maps column name -> baseline dictionary; None until a baseline exists.
        self.baseline: Optional[Dict[str, Dict]] = None

    @staticmethod
    def _is_numeric(column_type: "ColumnType") -> bool:
        """Return True for column types that get numeric drift checks."""
        return column_type in (ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE)

    @staticmethod
    def _is_categorical(column_type: "ColumnType") -> bool:
        """Return True for column types that get categorical drift checks."""
        return column_type in (
            ColumnType.CATEGORICAL_NOMINAL,
            ColumnType.CATEGORICAL_ORDINAL,
            ColumnType.CATEGORICAL_CYCLICAL,
            ColumnType.BINARY,
        )

    @staticmethod
    def _format_categories(categories) -> str:
        """Render categories as a sorted, comma-separated string.

        Values are converted with ``str`` first so non-string categories
        (for example 0/1 in a binary column) do not break ``str.join``.
        """
        return ", ".join(sorted(str(category) for category in categories))

    def set_baseline(self, column_name: str, series: pd.Series, column_type: "ColumnType"):
        """Set baseline distribution for a column.

        Always records the column type value and the series length. Numeric
        and categorical types additionally store their distribution summary;
        any other type stores nothing more.
        """
        if self.baseline is None:
            self.baseline = {}

        baseline_data = {
            "column_type": column_type.value,
            "sample_size": len(series),
        }

        if self._is_numeric(column_type):
            baseline_data.update(self._capture_numeric_baseline(series))
        elif self._is_categorical(column_type):
            baseline_data.update(self._capture_categorical_baseline(series))

        self.baseline[column_name] = baseline_data

    def _capture_numeric_baseline(self, series: pd.Series) -> Dict:
        """Capture baseline statistics for a numeric column (nulls dropped).

        Returns plain Python floats/ints so the result can be written as JSON.
        """
        clean_series = series.dropna()
        # Compute the histogram once; it is stored for the later PSI calculation.
        counts, edges = np.histogram(clean_series, bins=self._HISTOGRAM_BINS)
        return {
            "mean": float(clean_series.mean()),
            "std": float(clean_series.std()),
            "median": float(clean_series.median()),
            "min": float(clean_series.min()),
            "max": float(clean_series.max()),
            "q1": float(clean_series.quantile(0.25)),
            "q3": float(clean_series.quantile(0.75)),
            "histogram_bins": self._HISTOGRAM_BINS,
            "histogram_edges": [float(edge) for edge in edges],
            "histogram_counts": [int(count) for count in counts],
        }

    def _capture_categorical_baseline(self, series: pd.Series) -> Dict:
        """Capture category labels, counts and proportions (nulls dropped)."""
        clean_series = series.dropna()
        value_counts = clean_series.value_counts()
        return {
            "categories": value_counts.index.tolist(),
            "counts": value_counts.values.tolist(),
            "proportions": (value_counts / len(clean_series)).to_dict(),
        }

    def detect_drift(self, column_name: str, series: pd.Series, column_type: "ColumnType") -> "DriftResult":
        """Detect drift for a single column.

        Raises:
            ValueError: if no baseline has been recorded for ``column_name``.
        """
        if self.baseline is None or column_name not in self.baseline:
            raise ValueError(f"No baseline found for column '{column_name}'")

        baseline = self.baseline[column_name]

        if self._is_numeric(column_type):
            return self._detect_numeric_drift(column_name, series, baseline)
        if self._is_categorical(column_type):
            return self._detect_categorical_drift(column_name, series, baseline)

        # Default: no drift detection for other types.
        return DriftResult(
            column_name=column_name,
            has_drift=False,
            severity="low",
            metrics={},
            recommendations=[],
        )

    def _detect_numeric_drift(self, column_name: str, series: pd.Series, baseline: Dict) -> "DriftResult":
        """Detect drift in a numeric column.

        Severity is graded from PSI first; a large mean shift or variance
        change then forces ``has_drift`` and lifts "low" to "medium".
        """
        clean_series = series.dropna()
        metrics: Dict[str, Any] = {}
        recommendations: List[str] = []

        # KS needs raw observations, so rebuild a sample from the stored histogram.
        baseline_sample = self._reconstruct_numeric_baseline_sample(baseline)
        ks_statistic, ks_pvalue = compute_ks_statistic(pd.Series(baseline_sample), clean_series)
        metrics["ks_statistic"] = ks_statistic
        metrics["ks_pvalue"] = ks_pvalue

        psi = compute_psi_numeric(clean_series, baseline["histogram_edges"], baseline["histogram_counts"])
        metrics["psi"] = psi

        # Mean shift, normalized by baseline std (0 when the baseline has no spread).
        current_mean = clean_series.mean()
        mean_shift = (current_mean - baseline["mean"]) / baseline["std"] if baseline["std"] > 0 else 0
        metrics["mean_shift"] = float(mean_shift)

        # Ratio of current std to baseline std (1.0 when the baseline has no spread).
        current_std = clean_series.std()
        variance_ratio = current_std / baseline["std"] if baseline["std"] > 0 else 1.0
        metrics["variance_ratio"] = float(variance_ratio)

        # Determine drift severity based on PSI thresholds.
        if psi >= self._PSI_CRITICAL:
            severity = "critical"
            has_drift = True
            recommendations.append("Critical drift detected. Model performance likely degraded significantly.")
            recommendations.append("Consider retraining model with recent data.")
        elif psi >= self._PSI_HIGH:
            severity = "high"
            has_drift = True
            recommendations.append("Significant drift detected. Investigate data source changes.")
            recommendations.append("Monitor model performance closely.")
        elif psi >= self._PSI_MEDIUM:
            severity = "medium"
            has_drift = True
            recommendations.append("Moderate drift detected. Continue monitoring.")
        else:
            severity = "low"
            has_drift = False

        # Additional checks for mean shift and variance.
        if abs(mean_shift) > 2:
            has_drift = True
            if severity == "low":
                severity = "medium"
            recommendations.append(f"Mean shifted by {mean_shift:.2f} standard deviations.")

        if variance_ratio > 2 or variance_ratio < 0.5:
            has_drift = True
            if severity == "low":
                severity = "medium"
            recommendations.append(f"Variance changed significantly (ratio: {variance_ratio:.2f}).")

        return DriftResult(
            column_name=column_name,
            has_drift=has_drift,
            severity=severity,
            metrics=metrics,
            recommendations=recommendations,
        )

    def _detect_categorical_drift(self, column_name: str, series: pd.Series, baseline: Dict) -> "DriftResult":
        """Detect drift in a categorical column.

        Severity is graded from PSI first. Below the lowest PSI threshold the
        chi-square p-value decides ``has_drift``. New or missing categories
        then force ``has_drift`` and lift "low" to "medium".
        """
        clean_series = series.dropna()
        metrics: Dict[str, Any] = {}
        recommendations: List[str] = []

        # Compare the category sets seen now and in the baseline.
        current_categories = set(clean_series.value_counts().index.tolist())
        baseline_categories = set(baseline["categories"])
        new_categories = current_categories - baseline_categories
        missing_categories = baseline_categories - current_categories

        # Sorted by string form so the output is stable across runs.
        metrics["new_categories"] = sorted(new_categories, key=str)
        metrics["missing_categories"] = sorted(missing_categories, key=str)

        # Expand the stored counts back into one observation per baseline row.
        baseline_sample = pd.Series(baseline["categories"]).repeat(baseline["counts"])
        psi = compute_psi_categorical(baseline_sample, clean_series)
        metrics["psi"] = psi

        chi_square_stat, chi_pvalue = compute_chi_square(clean_series, baseline["proportions"])
        metrics["chi_square_statistic"] = chi_square_stat
        metrics["chi_square_pvalue"] = chi_pvalue

        # Determine severity.
        if psi >= self._PSI_CRITICAL:
            severity = "critical"
            has_drift = True
            recommendations.append("Critical distribution shift detected.")
        elif psi >= self._PSI_HIGH:
            severity = "high"
            has_drift = True
            recommendations.append("Significant distribution change detected.")
        elif psi >= self._PSI_MEDIUM:
            severity = "medium"
            has_drift = True
            recommendations.append("Moderate distribution change detected.")
        else:
            severity = "low"
            # Convert a possible numpy bool to a Python bool.
            has_drift = bool(chi_pvalue < 0.05)

        # Check for new/missing categories.
        if new_categories:
            has_drift = True
            if severity == "low":
                severity = "medium"
            recommendations.append(f"New categories detected: {self._format_categories(new_categories)}")

        if missing_categories:
            has_drift = True
            if severity == "low":
                severity = "medium"
            recommendations.append(f"Missing categories: {self._format_categories(missing_categories)}")

        return DriftResult(
            column_name=column_name,
            has_drift=has_drift,
            severity=severity,
            metrics=metrics,
            recommendations=recommendations,
        )

    def _reconstruct_numeric_baseline_sample(self, baseline: Dict) -> np.ndarray:
        """Reconstruct a sample from the baseline histogram for the KS test.

        Each bin contributes ``count`` evenly spaced points inside its edges.
        The reconstruction is deterministic, so the same baseline and data
        always give the same KS metrics.
        """
        edges = baseline["histogram_edges"]
        counts = baseline["histogram_counts"]

        samples: List[float] = []
        for i, count in enumerate(counts):
            if count > 0:
                lower, upper = edges[i], edges[i + 1]
                step = (upper - lower) / count
                # Midpoints of `count` equal sub-intervals of the bin.
                samples.extend(lower + step * (np.arange(count) + 0.5))

        return np.array(samples)

    def detect_drift_all(self, df: pd.DataFrame) -> List["DriftResult"]:
        """Detect drift for every baseline column that is present in ``df``.

        Raises:
            ValueError: if no baseline has been set.
        """
        if self.baseline is None:
            raise ValueError("No baseline set. Call set_baseline first.")

        results = []
        for column_name, column_baseline in self.baseline.items():
            if column_name in df.columns:
                column_type = ColumnType(column_baseline["column_type"])
                results.append(self.detect_drift(column_name, df[column_name], column_type))

        return results

    def set_baseline_from_profile(self, profile: "ProfileResult"):
        """Replace the baseline with one derived from a ProfileResult.

        Numeric metrics take precedence over categorical metrics when a
        column profile carries both.
        """
        self.baseline = {}

        for column_name, column_profile in profile.column_profiles.items():
            # Built from the profile's precomputed metrics, not from raw data.
            baseline_data = {
                "column_type": column_profile.configured_type.value,
                "sample_size": profile.total_rows,
            }

            if column_profile.numeric_metrics:
                metrics = column_profile.numeric_metrics
                baseline_data.update({
                    "mean": metrics.mean,
                    "std": metrics.std,
                    "median": metrics.median,
                    "min": metrics.min_value,
                    "max": metrics.max_value,
                    "q1": metrics.q1,
                    "q3": metrics.q3,
                    "histogram_bins": self._HISTOGRAM_BINS,
                    # Histogram data is optional on the metrics object.
                    "histogram_edges": getattr(metrics, "histogram_edges", []),
                    "histogram_counts": getattr(metrics, "histogram_counts", []),
                })

            elif column_profile.categorical_metrics:
                metrics = column_profile.categorical_metrics
                categories = list(metrics.value_counts.keys()) if metrics.value_counts else []
                counts = list(metrics.value_counts.values()) if metrics.value_counts else []
                # Fall back to 1 for empty or all-zero counts to avoid dividing by zero.
                total = sum(counts) or 1

                baseline_data.update({
                    "categories": categories,
                    "counts": counts,
                    "proportions": {cat: count / total for cat, count in zip(categories, counts)},
                })

            self.baseline[column_name] = baseline_data

    def save_baseline(self, filepath: str):
        """Save baseline to a JSON file.

        Raises:
            ValueError: if there is no baseline to save.
        """
        if self.baseline is None:
            raise ValueError("No baseline to save")

        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(self.baseline, f, indent=2)

    def load_baseline(self, filepath: str):
        """Load baseline from a JSON file, replacing any current baseline.

        JSON object keys are always strings, so non-string category keys in
        ``"proportions"`` come back as strings after a save/load round trip.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            self.baseline = json.load(f)