modelshift 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- modelshift-0.1.0/PKG-INFO +129 -0
- modelshift-0.1.0/README.md +106 -0
- modelshift-0.1.0/modelshift/__init__.py +3 -0
- modelshift-0.1.0/modelshift/baseline.py +37 -0
- modelshift-0.1.0/modelshift/drift/__init__.py +3 -0
- modelshift-0.1.0/modelshift/drift/feature_drift.py +50 -0
- modelshift-0.1.0/modelshift/drift/prediction_drift.py +111 -0
- modelshift-0.1.0/modelshift/drift/severity.py +239 -0
- modelshift-0.1.0/modelshift/monitor.py +317 -0
- modelshift-0.1.0/modelshift/selftest.py +398 -0
- modelshift-0.1.0/modelshift/storage/__init__.py +3 -0
- modelshift-0.1.0/modelshift/storage/sqlite_store.py +13 -0
- modelshift-0.1.0/modelshift/utils/__init__.py +3 -0
- modelshift-0.1.0/modelshift/utils/helpers.py +5 -0
- modelshift-0.1.0/modelshift.egg-info/PKG-INFO +129 -0
- modelshift-0.1.0/modelshift.egg-info/SOURCES.txt +23 -0
- modelshift-0.1.0/modelshift.egg-info/dependency_links.txt +1 -0
- modelshift-0.1.0/modelshift.egg-info/requires.txt +4 -0
- modelshift-0.1.0/modelshift.egg-info/top_level.txt +1 -0
- modelshift-0.1.0/setup.cfg +4 -0
- modelshift-0.1.0/setup.py +29 -0
- modelshift-0.1.0/tests/test_feature_drift.py +50 -0
- modelshift-0.1.0/tests/test_monitor_integration.py +29 -0
- modelshift-0.1.0/tests/test_prediction_drift.py +49 -0
- modelshift-0.1.0/tests/test_severity.py +56 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: modelshift
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight machine learning drift monitoring and alerting engine.
|
|
5
|
+
Author: Krishna
|
|
6
|
+
Author-email: ryomensukuna2530@gmail.com
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: pandas
|
|
12
|
+
Requires-Dist: numpy
|
|
13
|
+
Requires-Dist: scipy
|
|
14
|
+
Requires-Dist: requests
|
|
15
|
+
Dynamic: author
|
|
16
|
+
Dynamic: author-email
|
|
17
|
+
Dynamic: classifier
|
|
18
|
+
Dynamic: description
|
|
19
|
+
Dynamic: description-content-type
|
|
20
|
+
Dynamic: requires-dist
|
|
21
|
+
Dynamic: requires-python
|
|
22
|
+
Dynamic: summary
|
|
23
|
+
|
|
24
|
+
# 🚦 ModelShift-Lite
|
|
25
|
+
### Label-Free Monitoring for Deployed Machine Learning Models
|
|
26
|
+
|
|
27
|
+
> A lightweight, behavior-centric system to detect **silent reliability degradation** in deployed machine learning models — without requiring ground-truth labels.
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## 📌 Why ModelShift-Lite?
|
|
32
|
+
|
|
33
|
+
Machine learning models rarely fail loudly after deployment.
|
|
34
|
+
Instead, they **silently degrade** as real-world data changes — while true labels are unavailable for continuous evaluation.
|
|
35
|
+
|
|
36
|
+
**ModelShift-Lite addresses this blind spot.**
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## 🧩 Problem Statement
|
|
41
|
+
|
|
42
|
+
Deployed machine learning models often degrade silently over time due to changing data distributions, while ground-truth labels are unavailable for continuous performance evaluation.
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## 🎯 Project Objective
|
|
47
|
+
|
|
48
|
+
Design a **label-free, post-deployment monitoring system** that tracks:
|
|
49
|
+
|
|
50
|
+
- Data distribution shifts
|
|
51
|
+
- Prediction behavior instability
|
|
52
|
+
- Model reliability trends
|
|
53
|
+
|
|
54
|
+
to provide **early warning signals** of degradation **without modifying the deployed model**.
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## 🚫 What This Project Does *Not* Do
|
|
59
|
+
|
|
60
|
+
To maintain clarity of scope, ModelShift-Lite explicitly does **not**:
|
|
61
|
+
|
|
62
|
+
- ❌ Retrain models
|
|
63
|
+
- ❌ Correct predictions
|
|
64
|
+
- ❌ Compute accuracy on production data
|
|
65
|
+
|
|
66
|
+
It focuses solely on **monitoring and interpretability**.
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## 🧠 Core Idea (In Simple Terms)
|
|
71
|
+
|
|
72
|
+
> *If we cannot measure correctness, we can still monitor behavior.*
|
|
73
|
+
|
|
74
|
+
ModelShift-Lite observes how a model **reacts** to changing data and identifies signs of instability before failures become obvious.
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## 🛠️ Key Components
|
|
79
|
+
|
|
80
|
+
- **Reference Baseline Handling**
|
|
81
|
+
Captures normal model behavior from historical or validation data
|
|
82
|
+
|
|
83
|
+
- **Live Inference Monitoring**
|
|
84
|
+
Tracks incoming production data and predictions
|
|
85
|
+
|
|
86
|
+
- **Feature Drift Detection**
|
|
87
|
+
Identifies changes in input distributions
|
|
88
|
+
|
|
89
|
+
- **Prediction Behavior Analysis**
|
|
90
|
+
Monitors confidence, stability, and output distribution shifts
|
|
91
|
+
|
|
92
|
+
- **Model Health Scoring**
|
|
93
|
+
Aggregates drift signals into an interpretable reliability indicator
|
|
94
|
+
|
|
95
|
+
- **Visualization Dashboard**
|
|
96
|
+
Displays trends, drift severity, and degradation warnings
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
Reference Data →
|
|
100
|
+
→ Drift Detection → Health Scoring → Monitoring Dashboard
|
|
101
|
+
Live Inference →
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
*(Detailed architecture diagrams are provided in `/docs`)*
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## 💻 Technology Stack
|
|
109
|
+
|
|
110
|
+
- **Language:** Python
|
|
111
|
+
- **Data Processing:** NumPy, Pandas
|
|
112
|
+
- **Statistical Analysis:** SciPy
|
|
113
|
+
- **Visualization:** Streamlit, Matplotlib
|
|
114
|
+
- **Storage:** SQLite (local, replaceable)
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## 📂 Repository Structure
|
|
119
|
+
|
|
120
|
+
```text
|
|
121
|
+
modelshift-lite/
|
|
122
|
+
├── modelshift/ # Core monitoring logic
|
|
123
|
+
├── dashboard/ # Streamlit visualization app
|
|
124
|
+
├── experiments/ # Drift simulation & analysis
|
|
125
|
+
├── data/ # Reference & live data
|
|
126
|
+
├── docs/ # Architecture and design docs
|
|
127
|
+
└── README.md
```
|
|
128
|
+
## 🏗️ High-Level Architecture
|
|
129
|
+
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# 🚦 ModelShift-Lite
|
|
2
|
+
### Label-Free Monitoring for Deployed Machine Learning Models
|
|
3
|
+
|
|
4
|
+
> A lightweight, behavior-centric system to detect **silent reliability degradation** in deployed machine learning models — without requiring ground-truth labels.
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## 📌 Why ModelShift-Lite?
|
|
9
|
+
|
|
10
|
+
Machine learning models rarely fail loudly after deployment.
|
|
11
|
+
Instead, they **silently degrade** as real-world data changes — while true labels are unavailable for continuous evaluation.
|
|
12
|
+
|
|
13
|
+
**ModelShift-Lite addresses this blind spot.**
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## 🧩 Problem Statement
|
|
18
|
+
|
|
19
|
+
Deployed machine learning models often degrade silently over time due to changing data distributions, while ground-truth labels are unavailable for continuous performance evaluation.
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## 🎯 Project Objective
|
|
24
|
+
|
|
25
|
+
Design a **label-free, post-deployment monitoring system** that tracks:
|
|
26
|
+
|
|
27
|
+
- Data distribution shifts
|
|
28
|
+
- Prediction behavior instability
|
|
29
|
+
- Model reliability trends
|
|
30
|
+
|
|
31
|
+
to provide **early warning signals** of degradation **without modifying the deployed model**.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## 🚫 What This Project Does *Not* Do
|
|
36
|
+
|
|
37
|
+
To maintain clarity of scope, ModelShift-Lite explicitly does **not**:
|
|
38
|
+
|
|
39
|
+
- ❌ Retrain models
|
|
40
|
+
- ❌ Correct predictions
|
|
41
|
+
- ❌ Compute accuracy on production data
|
|
42
|
+
|
|
43
|
+
It focuses solely on **monitoring and interpretability**.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## 🧠 Core Idea (In Simple Terms)
|
|
48
|
+
|
|
49
|
+
> *If we cannot measure correctness, we can still monitor behavior.*
|
|
50
|
+
|
|
51
|
+
ModelShift-Lite observes how a model **reacts** to changing data and identifies signs of instability before failures become obvious.
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## 🛠️ Key Components
|
|
56
|
+
|
|
57
|
+
- **Reference Baseline Handling**
|
|
58
|
+
Captures normal model behavior from historical or validation data
|
|
59
|
+
|
|
60
|
+
- **Live Inference Monitoring**
|
|
61
|
+
Tracks incoming production data and predictions
|
|
62
|
+
|
|
63
|
+
- **Feature Drift Detection**
|
|
64
|
+
Identifies changes in input distributions
|
|
65
|
+
|
|
66
|
+
- **Prediction Behavior Analysis**
|
|
67
|
+
Monitors confidence, stability, and output distribution shifts
|
|
68
|
+
|
|
69
|
+
- **Model Health Scoring**
|
|
70
|
+
Aggregates drift signals into an interpretable reliability indicator
|
|
71
|
+
|
|
72
|
+
- **Visualization Dashboard**
|
|
73
|
+
Displays trends, drift severity, and degradation warnings
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
Reference Data →
|
|
77
|
+
→ Drift Detection → Health Scoring → Monitoring Dashboard
|
|
78
|
+
Live Inference →
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
*(Detailed architecture diagrams are provided in `/docs`)*
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## 💻 Technology Stack
|
|
86
|
+
|
|
87
|
+
- **Language:** Python
|
|
88
|
+
- **Data Processing:** NumPy, Pandas
|
|
89
|
+
- **Statistical Analysis:** SciPy
|
|
90
|
+
- **Visualization:** Streamlit, Matplotlib
|
|
91
|
+
- **Storage:** SQLite (local, replaceable)
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## 📂 Repository Structure
|
|
96
|
+
|
|
97
|
+
```text
|
|
98
|
+
modelshift-lite/
|
|
99
|
+
├── modelshift/ # Core monitoring logic
|
|
100
|
+
├── dashboard/ # Streamlit visualization app
|
|
101
|
+
├── experiments/ # Drift simulation & analysis
|
|
102
|
+
├── data/ # Reference & live data
|
|
103
|
+
├── docs/ # Architecture and design docs
|
|
104
|
+
└── README.md
```
|
|
105
|
+
## 🏗️ High-Level Architecture
|
|
106
|
+
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class BaselineWindow:
    """
    Immutable snapshot of reference data representing the model's
    normal (baseline) behavior.
    """

    def __init__(self, data: pd.DataFrame):
        # Reject anything that is not a non-empty DataFrame before copying.
        self._validate(data)
        self.data = data.copy()          # defensive copy; callers keep their frame
        self.feature_names = list(data.columns)
        self.num_samples = len(data)

    def _validate(self, data):
        # Type check first so the raised error is as specific as possible.
        if not isinstance(data, pd.DataFrame):
            raise TypeError("Baseline data must be a pandas DataFrame")
        if data.empty:
            raise ValueError("Baseline data cannot be empty")

    def get_data(self) -> pd.DataFrame:
        """Return a defensive copy of the stored baseline frame."""
        return self.data.copy()

    def summary(self) -> dict:
        """Return basic metadata: sample count, feature count, feature names."""
        return {
            "num_samples": self.num_samples,
            "num_features": len(self.feature_names),
            "feature_names": self.feature_names,
        }
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from scipy.stats import ks_2samp
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def compute_feature_drift(
    baseline_data: pd.DataFrame,
    live_data: pd.DataFrame
) -> dict:
    """
    Run a per-feature Kolmogorov–Smirnov test between baseline and live data.

    Returns a dict keyed by feature name:
        {feature_name: {"ks_statistic": float, "p_value": float}}
    """
    _validate_inputs(baseline_data, live_data)

    results: dict = {}
    for column in baseline_data.columns:
        # NaNs are dropped so the KS test only compares observed values.
        reference = baseline_data[column].dropna()
        current = live_data[column].dropna()

        statistic, pvalue = ks_2samp(reference, current)

        results[column] = {
            "ks_statistic": float(statistic),
            "p_value": float(pvalue),
        }

    return results


def _validate_inputs(baseline_data, live_data):
    # Fail fast with precise messages before any statistics run.
    if not isinstance(baseline_data, pd.DataFrame):
        raise TypeError("Baseline data must be a pandas DataFrame")
    if not isinstance(live_data, pd.DataFrame):
        raise TypeError("Live data must be a pandas DataFrame")
    if baseline_data.empty or live_data.empty:
        raise ValueError("Baseline and live data cannot be empty")
    if list(baseline_data.columns) != list(live_data.columns):
        raise ValueError("Baseline and live data must have identical features")
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from scipy.stats import ks_2samp
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def compute_prediction_drift(
|
|
8
|
+
baseline_predictions: np.ndarray,
|
|
9
|
+
live_predictions: np.ndarray
|
|
10
|
+
) -> dict:
|
|
11
|
+
"""
|
|
12
|
+
Compute prediction behavior drift using:
|
|
13
|
+
1) KS-test on prediction probability distributions
|
|
14
|
+
2) Binary entropy change (mean entropy of predicted probabilities)
|
|
15
|
+
|
|
16
|
+
Notes:
|
|
17
|
+
- Expects 1D probability arrays (values in [0, 1]).
|
|
18
|
+
- Uses full binary entropy: -(p*log(p) + (1-p)*log(1-p))
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
baseline = _prepare_predictions("baseline", baseline_predictions)
|
|
22
|
+
live = _prepare_predictions("live", live_predictions)
|
|
23
|
+
|
|
24
|
+
# KS-test on prediction distributions
|
|
25
|
+
ks_stat, p_value = ks_2samp(baseline, live)
|
|
26
|
+
|
|
27
|
+
# Entropy analysis (full binary entropy)
|
|
28
|
+
baseline_entropy = _binary_entropy_mean(baseline)
|
|
29
|
+
live_entropy = _binary_entropy_mean(live)
|
|
30
|
+
entropy_change = live_entropy - baseline_entropy
|
|
31
|
+
|
|
32
|
+
# Lightweight shape/center diagnostics (useful for dashboards/reports)
|
|
33
|
+
baseline_mean = float(np.mean(baseline))
|
|
34
|
+
live_mean = float(np.mean(live))
|
|
35
|
+
baseline_std = float(np.std(baseline))
|
|
36
|
+
live_std = float(np.std(live))
|
|
37
|
+
baseline_median = float(np.median(baseline))
|
|
38
|
+
live_median = float(np.median(live))
|
|
39
|
+
|
|
40
|
+
return {
|
|
41
|
+
"ks_statistic": float(ks_stat),
|
|
42
|
+
"p_value": float(p_value),
|
|
43
|
+
|
|
44
|
+
"baseline_entropy": round(float(baseline_entropy), 6),
|
|
45
|
+
"live_entropy": round(float(live_entropy), 6),
|
|
46
|
+
"entropy_change": round(float(entropy_change), 6),
|
|
47
|
+
"abs_entropy_change": round(float(abs(entropy_change)), 6),
|
|
48
|
+
|
|
49
|
+
"baseline_mean_prob": round(baseline_mean, 6),
|
|
50
|
+
"live_mean_prob": round(live_mean, 6),
|
|
51
|
+
"mean_prob_shift": round(float(live_mean - baseline_mean), 6),
|
|
52
|
+
|
|
53
|
+
"baseline_median_prob": round(baseline_median, 6),
|
|
54
|
+
"live_median_prob": round(live_median, 6),
|
|
55
|
+
"median_prob_shift": round(float(live_median - baseline_median), 6),
|
|
56
|
+
|
|
57
|
+
"baseline_std_prob": round(baseline_std, 6),
|
|
58
|
+
"live_std_prob": round(live_std, 6),
|
|
59
|
+
"std_prob_shift": round(float(live_std - baseline_std), 6),
|
|
60
|
+
|
|
61
|
+
"n_baseline": int(baseline.size),
|
|
62
|
+
"n_live": int(live.size),
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _binary_entropy_mean(preds: np.ndarray) -> float:
|
|
67
|
+
"""
|
|
68
|
+
Mean binary entropy for probability predictions:
|
|
69
|
+
H(p) = -(p*log(p) + (1-p)*log(1-p))
|
|
70
|
+
|
|
71
|
+
Uses natural log (nats).
|
|
72
|
+
"""
|
|
73
|
+
eps = 1e-9
|
|
74
|
+
p = np.clip(preds.astype(float), eps, 1.0 - eps)
|
|
75
|
+
entropy = -(p * np.log(p) + (1.0 - p) * np.log(1.0 - p))
|
|
76
|
+
return float(np.mean(entropy))
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _prepare_predictions(name: str, arr) -> np.ndarray:
|
|
80
|
+
"""
|
|
81
|
+
Validate and normalize prediction arrays to a clean 1D float numpy array.
|
|
82
|
+
"""
|
|
83
|
+
if arr is None:
|
|
84
|
+
raise ValueError(f"{name.capitalize()} predictions cannot be None")
|
|
85
|
+
|
|
86
|
+
if not isinstance(arr, np.ndarray):
|
|
87
|
+
# Allow lists/Series while staying user-friendly
|
|
88
|
+
try:
|
|
89
|
+
arr = np.asarray(arr, dtype=float)
|
|
90
|
+
except Exception as exc:
|
|
91
|
+
raise TypeError(
|
|
92
|
+
f"{name.capitalize()} predictions must be a numpy array or array-like of numeric values"
|
|
93
|
+
) from exc
|
|
94
|
+
else:
|
|
95
|
+
arr = arr.astype(float, copy=False)
|
|
96
|
+
|
|
97
|
+
arr = np.ravel(arr)
|
|
98
|
+
|
|
99
|
+
if arr.size == 0:
|
|
100
|
+
raise ValueError(f"{name.capitalize()} prediction array cannot be empty")
|
|
101
|
+
|
|
102
|
+
if not np.all(np.isfinite(arr)):
|
|
103
|
+
raise ValueError(f"{name.capitalize()} predictions contain NaN/Inf values")
|
|
104
|
+
|
|
105
|
+
# We treat these as probability predictions for entropy-based drift
|
|
106
|
+
if np.min(arr) < 0.0 or np.max(arr) > 1.0:
|
|
107
|
+
raise ValueError(
|
|
108
|
+
f"{name.capitalize()} predictions must be probability values in [0, 1]"
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
return arr
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List, Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# ----------------------------
|
|
7
|
+
# Basic per-signal thresholds
|
|
8
|
+
# ----------------------------
|
|
9
|
+
# ----------------------------
# Basic per-signal thresholds
# ----------------------------
FEATURE_KS_LOW = 0.10
FEATURE_KS_MEDIUM = 0.20
FEATURE_KS_HIGH = 0.35

PRED_KS_WARNING = 0.10
PRED_KS_CRITICAL = 0.15

ENTROPY_DELTA_WARNING = 0.01
ENTROPY_DELTA_CRITICAL = 0.02


def classify_severity(ks_statistic: float) -> str:
    """
    Map a KS-like drift statistic onto LOW / MEDIUM / HIGH / CRITICAL.
    """
    ks = _safe_float(ks_statistic, default=0.0)
    for bound, label in (
        (FEATURE_KS_LOW, "LOW"),
        (FEATURE_KS_MEDIUM, "MEDIUM"),
        (FEATURE_KS_HIGH, "HIGH"),
    ):
        if ks < bound:
            return label
    return "CRITICAL"


def compute_health_score(feature_drift_results: dict) -> float:
    """
    Overall model health score in [0, 100]; higher means healthier.

    Derived from the average feature KS:
        health = max(0, 100 * (1 - avg_ks))
    """
    summary = summarize_feature_drift(feature_drift_results)
    if summary["feature_count"] == 0:
        raise ValueError("Feature drift results cannot be empty")
    return round(float(max(0.0, 100.0 * (1.0 - summary["avg_ks"]))), 2)


def summarize_feature_drift(feature_drift_results: Optional[dict]) -> Dict[str, Any]:
    """
    Summarize per-feature KS values; tolerant of missing/malformed entries.
    """
    if not isinstance(feature_drift_results, dict):
        return _empty_feature_summary()

    pairs: List[tuple[str, float]] = []
    for feature_name, payload in feature_drift_results.items():
        if not isinstance(payload, dict):
            continue
        ks = _safe_float(payload.get("ks_statistic"), default=None)
        if ks is not None:
            pairs.append((str(feature_name), ks))

    if not pairs:
        return _empty_feature_summary()

    values = [ks for _, ks in pairs]
    worst_name, worst_ks = max(pairs, key=lambda item: item[1])

    return {
        "feature_count": len(values),
        "avg_ks": round(sum(values) / len(values), 6),
        "max_ks": round(float(worst_ks), 6),
        "max_feature": worst_name,
        "ks_values": [round(float(v), 6) for v in values],
    }


def classify_drift_taxonomy(
    feature_drift_results: Optional[dict] = None,
    prediction_drift_results: Optional[dict] = None,
) -> str:
    """
    Feature-vs-prediction drift agreement taxonomy.

    Returns one of:
        STABLE
        ROBUST_SHIFT           (feature drift high, prediction drift low)
        SILENT_BEHAVIOR_DRIFT  (feature drift low, prediction drift high)
        DEGRADING_DRIFT        (both high)
    """
    summary = summarize_feature_drift(feature_drift_results)
    max_feature_ks = _safe_float(summary.get("max_ks"), default=0.0)

    pred_ks = 0.0
    if isinstance(prediction_drift_results, dict):
        pred_ks = _safe_float(prediction_drift_results.get("ks_statistic"), default=0.0)

    feature_high = max_feature_ks >= FEATURE_KS_MEDIUM
    pred_high = pred_ks >= PRED_KS_WARNING

    if feature_high and pred_high:
        return "DEGRADING_DRIFT"
    if feature_high:
        return "ROBUST_SHIFT"
    if pred_high:
        return "SILENT_BEHAVIOR_DRIFT"
    return "STABLE"


def evaluate_drift_state(
    feature_drift_results: Optional[dict] = None,
    prediction_drift_results: Optional[dict] = None,
) -> Dict[str, Any]:
    """
    Composite severity + status decision engine.

    Combines avg feature KS, max feature KS, prediction KS and entropy delta
    into a weighted composite score, then derives severity, status, taxonomy,
    and (when feature drift exists) a health score.
    """
    f_summary = summarize_feature_drift(feature_drift_results)
    avg_feature_ks = _safe_float(f_summary.get("avg_ks"), default=0.0)
    max_feature_ks = _safe_float(f_summary.get("max_ks"), default=0.0)

    pred_ks = 0.0
    entropy_change = 0.0
    if isinstance(prediction_drift_results, dict):
        pred_ks = _safe_float(prediction_drift_results.get("ks_statistic"), default=0.0)
        entropy_change = _safe_float(prediction_drift_results.get("entropy_change"), default=0.0)

    # Each signal is normalized into [0, 1] against its own "serious"
    # threshold, then blended with fixed weights.
    components = (
        (0.30, avg_feature_ks / FEATURE_KS_MEDIUM),              # average drift
        (0.25, max_feature_ks / FEATURE_KS_HIGH),                # worst feature
        (0.35, pred_ks / PRED_KS_CRITICAL),                      # behavior drift
        (0.10, abs(entropy_change) / ENTROPY_DELTA_CRITICAL),    # confidence shift
    )
    composite_score = sum(weight * min(1.0, part) for weight, part in components)

    severity = _classify_composite_severity(composite_score)

    # Status leans on prediction drift (behavior-centric monitoring).
    if pred_ks >= PRED_KS_CRITICAL or max_feature_ks >= FEATURE_KS_HIGH:
        status = "CRITICAL_DRIFT"
    elif (
        pred_ks >= PRED_KS_WARNING
        or max_feature_ks >= FEATURE_KS_MEDIUM
        or avg_feature_ks >= FEATURE_KS_LOW
    ):
        status = "WARNING_DRIFT"
    else:
        status = "STABLE"

    # Health score is only meaningful when feature drift data exists.
    health_score = (
        compute_health_score(feature_drift_results)
        if f_summary["feature_count"] > 0
        else None
    )

    return {
        "severity": severity,
        "status": status,
        "taxonomy": classify_drift_taxonomy(feature_drift_results, prediction_drift_results),
        "health_score": health_score,
        "signals": {
            "avg_feature_ks": round(avg_feature_ks, 6),
            "max_feature_ks": round(max_feature_ks, 6),
            "max_feature_name": f_summary.get("max_feature"),
            "prediction_ks": round(pred_ks, 6),
            "entropy_change": round(entropy_change, 6),
            "composite_score": round(float(composite_score), 6),
            "feature_count": int(f_summary.get("feature_count", 0)),
        },
        "thresholds": {
            "feature_ks_low": FEATURE_KS_LOW,
            "feature_ks_medium": FEATURE_KS_MEDIUM,
            "feature_ks_high": FEATURE_KS_HIGH,
            "pred_ks_warning": PRED_KS_WARNING,
            "pred_ks_critical": PRED_KS_CRITICAL,
            "entropy_delta_warning": ENTROPY_DELTA_WARNING,
            "entropy_delta_critical": ENTROPY_DELTA_CRITICAL,
        },
    }


# ----------------------------
# Internal helpers
# ----------------------------
def _empty_feature_summary() -> Dict[str, Any]:
    """Summary payload used when no usable feature drift data is present."""
    return {
        "feature_count": 0,
        "avg_ks": 0.0,
        "max_ks": 0.0,
        "max_feature": None,
        "ks_values": [],
    }


def _classify_composite_severity(score: float) -> str:
    """Severity bands for the already-normalized composite score (0..1)."""
    s = min(1.0, max(0.0, _safe_float(score, default=0.0)))
    if s < 0.20:
        return "LOW"
    if s < 0.45:
        return "MEDIUM"
    if s < 0.70:
        return "HIGH"
    return "CRITICAL"


def _safe_float(value: Any, default: Optional[float] = 0.0) -> Optional[float]:
    """Coerce *value* to float, returning *default* on None or failure."""
    if value is None:
        return default
    try:
        return float(value)
    except (TypeError, ValueError):
        return default
|