validmind 2.4.13__py3-none-any.whl → 2.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/tests/__types__.py +4 -0
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +15 -6
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +10 -3
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +349 -291
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +1 -1
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -37
- validmind/tests/ongoing_monitoring/FeatureDrift.py +182 -0
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +76 -0
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +91 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +57 -0
- validmind/unit_metrics/classification/sklearn/ROC_AUC.py +22 -1
- validmind/utils.py +1 -1
- validmind/vm_models/dataset/dataset.py +2 -1
- {validmind-2.4.13.dist-info → validmind-2.5.1.dist-info}/METADATA +1 -1
- {validmind-2.4.13.dist-info → validmind-2.5.1.dist-info}/RECORD +20 -16
- {validmind-2.4.13.dist-info → validmind-2.5.1.dist-info}/LICENSE +0 -0
- {validmind-2.4.13.dist-info → validmind-2.5.1.dist-info}/WHEEL +0 -0
- {validmind-2.4.13.dist-info → validmind-2.5.1.dist-info}/entry_points.txt +0 -0
@@ -6,10 +6,12 @@ from dataclasses import dataclass
 from typing import List

 import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd
 import seaborn as sns
 from sklearn import metrics

+from validmind.logging import get_logger
 from validmind.vm_models import (
     Figure,
     ResultSummary,
@@ -17,321 +19,377 @@ from validmind.vm_models import (
     ResultTableMetadata,
     ThresholdTest,
     ThresholdTestResult,
+    VMDataset,
+    VMModel,
 )

+logger = get_logger(__name__)
+
+DEFAULT_THRESHOLD = 0.04
+PERFORMANCE_METRICS = {
+    "accuracy": {
+        "function": metrics.accuracy_score,
+        "is_classification": True,
+        "is_lower_better": False,
+    },
+    "auc": {
+        "function": metrics.roc_auc_score,
+        "is_classification": True,
+        "is_lower_better": False,
+    },
+    "f1": {
+        "function": metrics.f1_score,
+        "is_classification": True,
+        "is_lower_better": False,
+    },
+    "precision": {
+        "function": metrics.precision_score,
+        "is_classification": True,
+        "is_lower_better": False,
+    },
+    "recall": {
+        "function": metrics.recall_score,
+        "is_classification": True,
+        "is_lower_better": False,
+    },
+    "mse": {
+        "function": metrics.mean_squared_error,
+        "is_classification": False,
+        "is_lower_better": True,
+    },
+    "mae": {
+        "function": metrics.mean_absolute_error,
+        "is_classification": False,
+        "is_lower_better": True,
+    },
+    "r2": {
+        "function": metrics.r2_score,
+        "is_classification": False,
+        "is_lower_better": False,
+    },
+    "mape": {
+        "function": metrics.mean_absolute_percentage_error,
+        "is_classification": False,
+        "is_lower_better": True,
+    },
+}
+
+
+def _prepare_results(
+    results_train: dict, results_test: dict, metric: str
+) -> pd.DataFrame:
+    results_train = pd.DataFrame(results_train)
+    results_test = pd.DataFrame(results_test)
+    results = results_train.copy()
+    results.rename(
+        columns={"shape": "training records", f"{metric}": f"training {metric}"},
+        inplace=True,
+    )
+    results[f"test {metric}"] = results_test[metric]
+
+    # Adjust gap calculation based on metric directionality
+    if PERFORMANCE_METRICS[metric]["is_lower_better"]:
+        results["gap"] = results[f"test {metric}"] - results[f"training {metric}"]
+    else:
+        results["gap"] = results[f"training {metric}"] - results[f"test {metric}"]
+
+    return results
+
+
+def _compute_metrics(
+    results: dict,
+    region: str,
+    df_region: pd.DataFrame,
+    target_column: str,
+    prob_column: str,
+    pred_column: str,
+    feature_column: str,
+    metric: str,
+    is_classification: bool,
+) -> None:
+    results["slice"].append(str(region))
+    results["shape"].append(df_region.shape[0])
+    results["feature"].append(feature_column)
+
+    # Check if any records
+    if df_region.empty:
+        results[metric].append(0)
+        return
+
+    metric_func = PERFORMANCE_METRICS[metric]["function"]
+    y_true = df_region[target_column].values
+
+    # AUC requires probability scores
+    if is_classification and metric == "auc":
+        # if only one class is present in the data, return 0
+        if len(np.unique(y_true)) == 1:
+            results[metric].append(0)
+            return

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        score = metric_func(y_true, df_region[prob_column].values)
+
+    # All other classification metrics
+    elif is_classification:
+        score = metric_func(y_true, df_region[pred_column].values)
+
+    # Regression metrics
+    else:
+        score = metric_func(y_true, df_region[pred_column].values)
+
+    results[metric].append(score)
+
+
+def _plot_overfit_regions(
+    df: pd.DataFrame, feature_column: str, threshold: float, metric: str
+) -> plt.Figure:
+    fig, ax = plt.subplots()
+    barplot = sns.barplot(data=df, x="slice", y="gap", ax=ax)
+    ax.tick_params(axis="x", rotation=90)
+
+    # Draw threshold line
+    axhline = ax.axhline(
+        y=threshold,
+        color="red",
+        linestyle="--",
+        linewidth=1,
+        label=f"Cut-Off Threshold: {threshold}",
+    )
+    ax.tick_params(axis="x", labelsize=20)
+    ax.tick_params(axis="y", labelsize=20)
+
+    ax.set_ylabel(f"{metric.upper()} Gap", weight="bold", fontsize=18)
+    ax.set_xlabel("Slice/Segments", weight="bold", fontsize=18)
+    ax.set_title(
+        f"Overfit regions in feature column: {feature_column}",
+        weight="bold",
+        fontsize=20,
+        wrap=True,
+    )
+
+    handles, labels = barplot.get_legend_handles_labels()
+    handles.append(axhline)
+    labels.append(axhline.get_label())
+
+    barplot.legend(
+        handles=handles[:-1],
+        labels=labels,
+        loc="upper center",
+        bbox_to_anchor=(0.5, 0.1),
+        ncol=len(handles),
+    )
+
+    plt.close("all")
+
+    return fig
+
+
+# TODO: make this a functional test instead of class-based when appropriate
+# simply have to remove the class and rename this func to OverfitDiagnosis
+def overfit_diagnosis(  # noqa: C901
+    model: VMModel,
+    datasets: List[VMDataset],
+    metric: str = None,
+    cut_off_threshold: float = DEFAULT_THRESHOLD,
+):
+    """Identify overfit regions in a model's predictions.
+
+    This test compares the model's performance on training versus test data, grouped by
+    feature columns. It calculates the difference between the training and test performance
+    for each group and identifies regions where the difference exceeds a specified threshold.
+
+    This test works for both classification and regression models and with a variety of
+    performance metrics. By default, it uses the AUC metric for classification models and
+    the MSE metric for regression models. The threshold for identifying overfit regions
+    defaults to 0.04 but should be adjusted based on the specific use case.
+
+    ## Inputs
+    - `model` (VMModel): The ValidMind model object to evaluate.
+    - `datasets` (List[VMDataset]): A list of two VMDataset objects where the first dataset
+        is the training data and the second dataset is the test data.
+
+    ## Parameters
+    - `metric` (str, optional): The performance metric to use for evaluation. Choose from:
+        'accuracy', 'auc', 'f1', 'precision', 'recall', 'mse', 'mae', 'r2', 'mape'.
+        Defaults to 'auc' for classification models and 'mse' for regression models.
+    - `cut_off_threshold` (float, optional): The threshold for identifying overfit regions.
+        Defaults to 0.04.
     """

-
-
-    default_params = {"features_columns": None, "cut_off_percentage": 4}
-    tasks = ["classification", "text_classification"]
-    tags = [
-        "sklearn",
-        "binary_classification",
-        "multiclass_classification",
-        "model_diagnosis",
-    ]
-
-    default_metrics = {
-        "accuracy": metrics.accuracy_score,
-    }
+    # Determine if it's a classification or regression model
+    is_classification = bool(datasets[0].probability_column(model))

-
-
-
-
+    # Set default metric if not provided
+    if metric is None:
+        metric = "auc" if is_classification else "mse"
+        logger.info(
+            f"Using default {'classification' if is_classification else 'regression'} metric: {metric}"
+        )

-
-
+    if id(cut_off_threshold) == id(DEFAULT_THRESHOLD):
+        logger.info("Using default cut-off threshold of 0.04")

-
-
-
-
+    metric = metric.lower()
+    try:
+        _metric = PERFORMANCE_METRICS[metric.lower()]
+    except KeyError:
+        raise ValueError(
+            f"Invalid metric. Choose from: {', '.join(PERFORMANCE_METRICS.keys())}"
+        )

-
-
-
+    if is_classification and not _metric["is_classification"]:
+        raise ValueError(f"Cannot use regression metric ({metric}) for classification.")
+    elif not is_classification and _metric["is_classification"]:
+        raise ValueError(f"Cannot use classification metric ({metric}) for regression.")
+
+    train_df = datasets[0].df
+    test_df = datasets[1].df
+
+    pred_column = f"{datasets[0].target_column}_pred"
+    prob_column = f"{datasets[0].target_column}_prob"
+
+    train_df[pred_column] = datasets[0].y_pred(model)
+    test_df[pred_column] = datasets[1].y_pred(model)
+
+    if is_classification:
+        train_df[prob_column] = datasets[0].y_prob(model)
+        test_df[prob_column] = datasets[1].y_prob(model)
+
+    test_results = []
+    test_figures = []
+    results_headers = ["slice", "shape", "feature", metric]
+
+    for feature_column in datasets[0].feature_columns:
+        bins = 10
+        if feature_column in datasets[0].feature_columns_categorical:
+            bins = len(train_df[feature_column].unique())
+        train_df["bin"] = pd.cut(train_df[feature_column], bins=bins)
+
+        results_train = {k: [] for k in results_headers}
+        results_test = {k: [] for k in results_headers}
+
+        for region, df_region in train_df.groupby("bin"):
+            _compute_metrics(
+                results=results_train,
+                region=region,
+                df_region=df_region,
+                feature_column=feature_column,
+                target_column=datasets[0].target_column,
+                prob_column=prob_column,
+                pred_column=pred_column,
+                metric=metric,
+                is_classification=is_classification,
+            )
+            df_test_region = test_df[
+                (test_df[feature_column] > region.left)
+                & (test_df[feature_column] <= region.right)
+            ]
+            _compute_metrics(
+                results=results_test,
+                region=region,
+                df_region=df_test_region,
+                feature_column=feature_column,
+                target_column=datasets[1].target_column,
+                prob_column=prob_column,
+                pred_column=pred_column,
+                metric=metric,
+                is_classification=is_classification,
             )

-
-
-
-
-
-
-
+        results = _prepare_results(results_train, results_test, metric)
+
+        fig = _plot_overfit_regions(results, feature_column, cut_off_threshold, metric)
+        test_figures.append(
+            Figure(
+                key=f"overfit_diagnosis:{metric}:{feature_column}",
+                figure=fig,
+                metadata={
+                    "metric": metric,
+                    "cut_off_threshold": cut_off_threshold,
+                    "feature": feature_column,
+                },
             )
+        )

-
-
-
+        for _, row in results[results["gap"] > cut_off_threshold].iterrows():
+            test_results.append(
+                {
+                    "Feature": feature_column,
+                    "Slice": row["slice"],
+                    "Number of Records": row["training records"],
+                    f"Training {metric.upper()}": row[f"training {metric}"],
+                    f"Test {metric.upper()}": row[f"test {metric}"],
+                    "Gap": row["gap"],
+                }
            )

-
-        prediction_column = f"{target_column}_pred"
-
-        # Add prediction column in the training dataset
-        train_df = self.inputs.datasets[0].df.copy()
-        train_class_pred = self.inputs.datasets[0].y_pred(self.inputs.model)
-        train_df[prediction_column] = train_class_pred
-
-        # Add prediction column in the test dataset
-        test_df = self.inputs.datasets[1].df.copy()
-        test_class_pred = self.inputs.datasets[1].y_pred(self.inputs.model)
-        test_df[prediction_column] = test_class_pred
-
-        test_results = []
-        test_figures = []
-        results_headers = ["slice", "shape", "feature"]
-        results_headers.extend(self.default_metrics.keys())
-
-        for feature_column in features_list:
-            bins = 10
-            if feature_column in self.inputs.datasets[0].feature_columns_categorical:
-                bins = len(train_df[feature_column].unique())
-            train_df["bin"] = pd.cut(train_df[feature_column], bins=bins)
-
-            results_train = {k: [] for k in results_headers}
-            results_test = {k: [] for k in results_headers}
-
-            for region, df_region in train_df.groupby("bin"):
-                self._compute_metrics(
-                    results_train,
-                    region,
-                    df_region,
-                    target_column,
-                    prediction_column,
-                    feature_column,
-                )
-                df_test_region = test_df[
-                    (test_df[feature_column] > region.left)
-                    & (test_df[feature_column] <= region.right)
-                ]
-                self._compute_metrics(
-                    results_test,
-                    region,
-                    df_test_region,
-                    target_column,
-                    prediction_column,
-                    feature_column,
-                )
+    return {"Overfit Diagnosis": test_results}, *test_figures

-            results = self._prepare_results(results_train, results_test)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+@dataclass
+class OverfitDiagnosis(ThresholdTest):
+    """Identify overfit regions in a model's predictions.
+
+    This test compares the model's performance on training versus test data, grouped by
+    feature columns. It calculates the difference between the training and test performance
+    for each group and identifies regions where the difference exceeds a specified threshold.
+
+    This test works for both classification and regression models and with a variety of
+    performance metrics. By default, it uses the AUC metric for classification models and
+    the MSE metric for regression models. The threshold for identifying overfit regions
+    defaults to 0.04 but should be adjusted based on the specific use case.
+
+    ## Inputs
+    - `model` (VMModel): The ValidMind model object to evaluate.
+    - `datasets` (List[VMDataset]): A list of two VMDataset objects where the first dataset
+        is the training data and the second dataset is the test data.
+
+    ## Parameters
+    - `metric` (str, optional): The performance metric to use for evaluation. Choose from:
+        'accuracy', 'auc', 'f1', 'precision', 'recall', 'mse', 'mae', 'r2', 'mape'.
+        Defaults to 'auc' for classification models and 'mse' for regression models.
+    - `cut_off_threshold` (float, optional): The threshold for identifying overfit regions.
+        Defaults to 0.04.
+    """

-
-
-
-
+    required_inputs = ["model", "datasets"]
+    default_params = {"metric": None, "cut_off_threshold": DEFAULT_THRESHOLD}
+    tasks = ["classification", "regression"]
+    tags = [
+        "sklearn",
+        "binary_classification",
+        "multiclass_classification",
+        "linear_regression",
+        "model_diagnosis",
+    ]
+
+    def run(self):
+        func_result = overfit_diagnosis(
+            self.inputs.model,
+            self.inputs.datasets,
+            metric=self.params["metric"],
+            cut_off_threshold=self.params["cut_off_threshold"],
+        )
+
+        return self.cache_results(
+            test_results_list=[
                 ThresholdTestResult(
-                    test_name="
-                    column=
-                    passed=
-                    values={
+                    test_name=self.params["metric"],
+                    column=row["Feature"],
+                    passed=False,
+                    values={k: v for k, v in row.items()},
                 )
-
-
-
-
-            figures=test_figures,
+                for row in func_result[0]["Overfit Diagnosis"]
+            ],
+            passed=(not func_result[0]["Overfit Diagnosis"]),
+            figures=func_result[1:],
         )

-    def summary(self, results
-        results_table = [
-            record for result in results for record in result.values["records"]
-        ]
+    def summary(self, results, _):
         return ResultSummary(
             results=[
                 ResultTable(
-                    data=
-                    metadata=ResultTableMetadata(title="Overfit
+                    data=[result.values for result in results],
+                    metadata=ResultTableMetadata(title="Overfit Diagnosis"),
                 )
-            ]
-        )
-
-    def _prepare_results(self, results_train: dict, results_test: dict) -> pd.DataFrame:
-        """
-        Prepares and returns a DataFrame with training and testing results.
-        Args:
-            results_train (dict): A dictionary containing training results.
-            results_test (dict): A dictionary containing testing results.
-        Returns:
-            pd.DataFrame: A DataFrame containing the following columns:
-                - 'shape': The number of training records used.
-                - 'slice': The name of the region being evaluated.
-                - 'training accuracy': The accuracy achieved on the training data (in percentage).
-                - 'test accuracy': The accuracy achieved on the testing data (in percentage).
-                - 'gap': The difference between the training and testing accuracies (in percentage).
-        """
-
-        results_train = pd.DataFrame(results_train)
-        results_test = pd.DataFrame(results_test)
-        results = results_train.copy()
-        results.rename(
-            columns={"shape": "training records", "accuracy": "training accuracy"},
-            inplace=True,
-        )
-        results["training accuracy"] = results["training accuracy"] * 100
-        results["test accuracy"] = results_test["accuracy"] * 100
-        results["gap"] = results["training accuracy"] - results["test accuracy"]
-
-        return results
-
-    def _compute_metrics(
-        self,
-        results: dict,
-        region: str,
-        df_region: pd.DataFrame,
-        target_column: str,
-        prediction_column: str,
-        feature_column: str,
-    ) -> None:
-        """
-        Computes and appends the evaluation metrics for a given region.
-        Args:
-            results (dict): A dictionary containing the evaluation results for all regions.
-            region (str): The name of the region being evaluated.
-            df_region (pd.DataFrame): The DataFrame containing the region-specific data.
-            target_column (str): The name of the target column in the DataFrame.
-            prediction_column (str): The name of the column containing the model's predictions.
-        Returns:
-            None
-        """
-
-        results["slice"].append(str(region))
-        results["shape"].append(df_region.shape[0])
-        results["feature"].append(feature_column)
-
-        # Check if df_region is an empty dataframe and if so, append 0 to all metrics
-        if df_region.empty:
-            for metric in self.default_metrics.keys():
-                results[metric].append(0)
-            return
-
-        y_true = df_region[target_column].values
-        y_prediction = (
-            df_region[prediction_column].astype(df_region[target_column].dtypes).values
+            ],
         )
-
-        for metric, metric_fn in self.default_metrics.items():
-            results[metric].append(metric_fn(y_true, y_prediction))
-
-    def _plot_overfit_regions(
-        self, df: pd.DataFrame, feature_column: str, metric: str, threshold: float
-    ) -> plt.Figure:
-        """
-        Plots the overfit regions of a given DataFrame.
-        Args:
-            df (pd.DataFrame): A DataFrame containing the data to plot.
-            feature_column (str): The name of the feature column to plot.
-            threshold (float): The threshold value for the gap, above which a region is considered to be overfit.
-        Returns:
-            plt.Figure: A matplotlib Figure object containing the plot.
-        """
-
-        # Create a bar plot using seaborn library
-        fig, ax = plt.subplots()
-        barplot = sns.barplot(data=df, x="slice", y="gap", ax=ax)
-        ax.tick_params(axis="x", rotation=90)
-        # Draw threshold line
-        axhline = ax.axhline(
-            y=threshold,
-            color="red",
-            linestyle="--",
-            linewidth=1,
-            label=f"Cut-Off Percentage: {threshold}%",
-        )
-        ax.tick_params(axis="x", labelsize=20)
-        ax.tick_params(axis="y", labelsize=20)
-
-        ax.set_ylabel(f"{metric.capitalize()} Gap (%)", weight="bold", fontsize=18)
-        ax.set_xlabel("Slice/Segments", weight="bold", fontsize=18)
-        ax.set_title(
-            f"Overfit regions in feature column: {feature_column}",
-            weight="bold",
-            fontsize=20,
-            wrap=True,
-        )
-
-        # Get the legend handles and labels from the barplot
-        handles, labels = barplot.get_legend_handles_labels()
-
-        # Append the axhline handle and label
-        handles.append(axhline)
-        labels.append(axhline.get_label())
-
-        # Create a legend with both hue and axhline labels, the threshold line
-        # will show up twice so remove the first element
-        # barplot.legend(handles=handles[:-1], labels=labels, loc="upper right")
-        barplot.legend(
-            handles=handles[:-1],
-            labels=labels,
-            loc="upper center",
-            bbox_to_anchor=(0.5, 0.1),
-            ncol=len(handles),
-        )
-
-        # Do this if you want to prevent the figure from being displayed
-        plt.close("all")
-
-        return fig
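
Usage note: the refactor keeps the class-based OverfitDiagnosis wrapper delegating to the new overfit_diagnosis function, so the test can still be run by its test ID. Below is a minimal sketch of invoking it through the ValidMind test runner; the vm_train_ds / vm_test_ds / vm_model objects and the prior init_dataset / init_model / assign_predictions setup are assumptions about the surrounding harness, not part of this diff.

import validmind as vm

# Sketch only: assumes vm.init() has been called and that vm_train_ds, vm_test_ds
# (ValidMind datasets with predictions assigned) and vm_model already exist.
result = vm.tests.run_test(
    "validmind.model_validation.sklearn.OverfitDiagnosis",
    inputs={
        "model": vm_model,
        "datasets": [vm_train_ds, vm_test_ds],  # [training, test] order matters
    },
    params={
        "metric": "auc",            # any key from PERFORMANCE_METRICS; None picks auc/mse
        "cut_off_threshold": 0.04,  # per-slice train/test gap above this is flagged
    },
)
# The returned result can then be logged to the ValidMind platform.

Note that the gap is now metric-aware: for lower-is-better metrics (mse, mae, mape) it is computed as test minus training, otherwise training minus test, and the default cut-off is an absolute 0.04 rather than the old 4% accuracy gap.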