validmind 2.4.13__py3-none-any.whl → 2.5.1__py3-none-any.whl

This diff compares publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
@@ -6,10 +6,12 @@ from dataclasses import dataclass
  from typing import List

  import matplotlib.pyplot as plt
+ import numpy as np
  import pandas as pd
  import seaborn as sns
  from sklearn import metrics

+ from validmind.logging import get_logger
  from validmind.vm_models import (
      Figure,
      ResultSummary,
@@ -17,321 +19,377 @@ from validmind.vm_models import (
      ResultTableMetadata,
      ThresholdTest,
      ThresholdTestResult,
+     VMDataset,
+     VMModel,
  )

+ logger = get_logger(__name__)
+
+ DEFAULT_THRESHOLD = 0.04
+ PERFORMANCE_METRICS = {
+     "accuracy": {
+         "function": metrics.accuracy_score,
+         "is_classification": True,
+         "is_lower_better": False,
+     },
+     "auc": {
+         "function": metrics.roc_auc_score,
+         "is_classification": True,
+         "is_lower_better": False,
+     },
+     "f1": {
+         "function": metrics.f1_score,
+         "is_classification": True,
+         "is_lower_better": False,
+     },
+     "precision": {
+         "function": metrics.precision_score,
+         "is_classification": True,
+         "is_lower_better": False,
+     },
+     "recall": {
+         "function": metrics.recall_score,
+         "is_classification": True,
+         "is_lower_better": False,
+     },
+     "mse": {
+         "function": metrics.mean_squared_error,
+         "is_classification": False,
+         "is_lower_better": True,
+     },
+     "mae": {
+         "function": metrics.mean_absolute_error,
+         "is_classification": False,
+         "is_lower_better": True,
+     },
+     "r2": {
+         "function": metrics.r2_score,
+         "is_classification": False,
+         "is_lower_better": False,
+     },
+     "mape": {
+         "function": metrics.mean_absolute_percentage_error,
+         "is_classification": False,
+         "is_lower_better": True,
+     },
+ }
+
+
+ def _prepare_results(
+     results_train: dict, results_test: dict, metric: str
+ ) -> pd.DataFrame:
+     results_train = pd.DataFrame(results_train)
+     results_test = pd.DataFrame(results_test)
+     results = results_train.copy()
+     results.rename(
+         columns={"shape": "training records", f"{metric}": f"training {metric}"},
+         inplace=True,
+     )
+     results[f"test {metric}"] = results_test[metric]
+
+     # Adjust gap calculation based on metric directionality
+     if PERFORMANCE_METRICS[metric]["is_lower_better"]:
+         results["gap"] = results[f"test {metric}"] - results[f"training {metric}"]
+     else:
+         results["gap"] = results[f"training {metric}"] - results[f"test {metric}"]
+
+     return results
+
+
+ def _compute_metrics(
+     results: dict,
+     region: str,
+     df_region: pd.DataFrame,
+     target_column: str,
+     prob_column: str,
+     pred_column: str,
+     feature_column: str,
+     metric: str,
+     is_classification: bool,
+ ) -> None:
+     results["slice"].append(str(region))
+     results["shape"].append(df_region.shape[0])
+     results["feature"].append(feature_column)
+
+     # Check if any records
+     if df_region.empty:
+         results[metric].append(0)
+         return
+
+     metric_func = PERFORMANCE_METRICS[metric]["function"]
+     y_true = df_region[target_column].values
+
+     # AUC requires probability scores
+     if is_classification and metric == "auc":
+         # if only one class is present in the data, return 0
+         if len(np.unique(y_true)) == 1:
+             results[metric].append(0)
+             return

- @dataclass
- class OverfitDiagnosis(ThresholdTest):
-     """
-     Detects and visualizes overfit regions in an ML model by comparing performance on training and test datasets.
-
-     **Purpose**: The OverfitDiagnosis test is devised to detect areas within a Machine Learning model that might be
-     prone to overfitting. It achieves this by comparing the model's performance on both the training and testing
-     datasets. These datasets are broken down into distinct sections defined by a Feature Space. Areas, where the model
-     underperforms by displaying high residual values or a significant amount of overfitting, are highlighted, prompting
-     actions for mitigation using regularization techniques such as L1 or L2 regularization, Dropout, Early Stopping or
-     data augmentation.
-
-     **Test Mechanism**: The metric conducts the test by executing the method 'run' on the default parameters and
-     metrics with 'accuracy' as the specified metric. It segments the feature space by binning crucial feature columns
-     from both the training and testing datasets. Then, the method computes the prediction results for each defined
-     region. Subsequently, the prediction's efficacy is evaluated, i.e., the model's performance gap (defined as the
-     discrepancy between the actual and the model's predictions) for both datasets is calculated and compared with a
-     preset cut-off value for the overfitting condition. A test failure presents an overfit model, whereas a pass
-     signifies a fit model. Meanwhile, the function also prepares figures further illustrating the regions with
-     overfitting.
-
-     **Signs of High Risk**: Indicators of a high-risk model are:
-     - A high 'gap' value indicating discrepancies in the training and testing data accuracy signals an overfit model.
-     - Multiple or vast overfitting zones within the feature space suggest overcomplication of the model.
-
-     **Strengths**:
-     - Presents a visual perspective by plotting regions with overfit issues, simplifying understanding of the model
-     structure.
-     - Permits a feature-focused assessment, which promotes specific, targeted modifications to the model.
-     - Caters to modifications of the testing parameters such as 'cut_off_percentage' and 'features_column' enabling a
-     personalized analysis.
-     - Handles both numerical and categorical features.
-
-     **Limitations**:
-     - Does not currently support regression tasks and is limited to classification tasks only.
-     - Ineffectual for text-based features, which in turn restricts its usage for Natural Language Processing models.
-     - Primarily depends on the bins setting, responsible for segmenting the feature space. Different bin configurations
-     might yield varying results.
-     - Utilization of a fixed cut-off percentage for making overfitting decisions, set arbitrarily, leading to a
-     possible risk of inaccuracy.
-     - Limitation of performance metrics to accuracy alone might prove inadequate for detailed examination, especially
-     for imbalanced datasets.
+         score = metric_func(y_true, df_region[prob_column].values)
+
+     # All other classification metrics
+     elif is_classification:
+         score = metric_func(y_true, df_region[pred_column].values)
+
+     # Regression metrics
+     else:
+         score = metric_func(y_true, df_region[pred_column].values)
+
+     results[metric].append(score)
+
+
+ def _plot_overfit_regions(
+     df: pd.DataFrame, feature_column: str, threshold: float, metric: str
+ ) -> plt.Figure:
+     fig, ax = plt.subplots()
+     barplot = sns.barplot(data=df, x="slice", y="gap", ax=ax)
+     ax.tick_params(axis="x", rotation=90)
+
+     # Draw threshold line
+     axhline = ax.axhline(
+         y=threshold,
+         color="red",
+         linestyle="--",
+         linewidth=1,
+         label=f"Cut-Off Threshold: {threshold}",
+     )
+     ax.tick_params(axis="x", labelsize=20)
+     ax.tick_params(axis="y", labelsize=20)
+
+     ax.set_ylabel(f"{metric.upper()} Gap", weight="bold", fontsize=18)
+     ax.set_xlabel("Slice/Segments", weight="bold", fontsize=18)
+     ax.set_title(
+         f"Overfit regions in feature column: {feature_column}",
+         weight="bold",
+         fontsize=20,
+         wrap=True,
+     )
+
+     handles, labels = barplot.get_legend_handles_labels()
+     handles.append(axhline)
+     labels.append(axhline.get_label())
+
+     barplot.legend(
+         handles=handles[:-1],
+         labels=labels,
+         loc="upper center",
+         bbox_to_anchor=(0.5, 0.1),
+         ncol=len(handles),
+     )
+
+     plt.close("all")
+
+     return fig
+
+
+ # TODO: make this a functional test instead of class-based when appropriate
+ # simply have to remove the class and rename this func to OverfitDiagnosis
+ def overfit_diagnosis(  # noqa: C901
+     model: VMModel,
+     datasets: List[VMDataset],
+     metric: str = None,
+     cut_off_threshold: float = DEFAULT_THRESHOLD,
+ ):
+     """Identify overfit regions in a model's predictions.
+
+     This test compares the model's performance on training versus test data, grouped by
+     feature columns. It calculates the difference between the training and test performance
+     for each group and identifies regions where the difference exceeds a specified threshold.
+
+     This test works for both classification and regression models and with a variety of
+     performance metrics. By default, it uses the AUC metric for classification models and
+     the MSE metric for regression models. The threshold for identifying overfit regions
+     defaults to 0.04 but should be adjusted based on the specific use case.
+
+     ## Inputs
+     - `model` (VMModel): The ValidMind model object to evaluate.
+     - `datasets` (List[VMDataset]): A list of two VMDataset objects where the first dataset
+       is the training data and the second dataset is the test data.
+
+     ## Parameters
+     - `metric` (str, optional): The performance metric to use for evaluation. Choose from:
+       'accuracy', 'auc', 'f1', 'precision', 'recall', 'mse', 'mae', 'r2', 'mape'.
+       Defaults to 'auc' for classification models and 'mse' for regression models.
+     - `cut_off_threshold` (float, optional): The threshold for identifying overfit regions.
+       Defaults to 0.04.
      """

-     name = "overfit_regions"
-     required_inputs = ["model", "datasets"]
-     default_params = {"features_columns": None, "cut_off_percentage": 4}
-     tasks = ["classification", "text_classification"]
-     tags = [
-         "sklearn",
-         "binary_classification",
-         "multiclass_classification",
-         "model_diagnosis",
-     ]
-
-     default_metrics = {
-         "accuracy": metrics.accuracy_score,
-     }
+     # Determine if it's a classification or regression model
+     is_classification = bool(datasets[0].probability_column(model))

-     def run(self):
-         if "cut_off_percentage" not in self.params:
-             raise ValueError("cut_off_percentage must be provided in params")
-         cut_off_percentage = self.params["cut_off_percentage"]
+     # Set default metric if not provided
+     if metric is None:
+         metric = "auc" if is_classification else "mse"
+         logger.info(
+             f"Using default {'classification' if is_classification else 'regression'} metric: {metric}"
+         )

-         if "features_columns" not in self.params:
-             raise ValueError("features_columns must be provided in params")
+     if id(cut_off_threshold) == id(DEFAULT_THRESHOLD):
+         logger.info("Using default cut-off threshold of 0.04")

-         if self.params["features_columns"] is None:
-             features_list = self.inputs.datasets[0].feature_columns
-         else:
-             features_list = self.params["features_columns"]
+     metric = metric.lower()
+     try:
+         _metric = PERFORMANCE_METRICS[metric.lower()]
+     except KeyError:
+         raise ValueError(
+             f"Invalid metric. Choose from: {', '.join(PERFORMANCE_METRICS.keys())}"
+         )

-         if self.inputs.datasets[0].text_column in features_list:
-             raise ValueError(
-                 "Skiping Overfit Diagnosis test for the dataset with text column"
+     if is_classification and not _metric["is_classification"]:
+         raise ValueError(f"Cannot use regression metric ({metric}) for classification.")
+     elif not is_classification and _metric["is_classification"]:
+         raise ValueError(f"Cannot use classification metric ({metric}) for regression.")
+
+     train_df = datasets[0].df
+     test_df = datasets[1].df
+
+     pred_column = f"{datasets[0].target_column}_pred"
+     prob_column = f"{datasets[0].target_column}_prob"
+
+     train_df[pred_column] = datasets[0].y_pred(model)
+     test_df[pred_column] = datasets[1].y_pred(model)
+
+     if is_classification:
+         train_df[prob_column] = datasets[0].y_prob(model)
+         test_df[prob_column] = datasets[1].y_prob(model)
+
+     test_results = []
+     test_figures = []
+     results_headers = ["slice", "shape", "feature", metric]
+
+     for feature_column in datasets[0].feature_columns:
+         bins = 10
+         if feature_column in datasets[0].feature_columns_categorical:
+             bins = len(train_df[feature_column].unique())
+         train_df["bin"] = pd.cut(train_df[feature_column], bins=bins)
+
+         results_train = {k: [] for k in results_headers}
+         results_test = {k: [] for k in results_headers}
+
+         for region, df_region in train_df.groupby("bin"):
+             _compute_metrics(
+                 results=results_train,
+                 region=region,
+                 df_region=df_region,
+                 feature_column=feature_column,
+                 target_column=datasets[0].target_column,
+                 prob_column=prob_column,
+                 pred_column=pred_column,
+                 metric=metric,
+                 is_classification=is_classification,
+             )
+             df_test_region = test_df[
+                 (test_df[feature_column] > region.left)
+                 & (test_df[feature_column] <= region.right)
+             ]
+             _compute_metrics(
+                 results=results_test,
+                 region=region,
+                 df_region=df_test_region,
+                 feature_column=feature_column,
+                 target_column=datasets[1].target_column,
+                 prob_column=prob_column,
+                 pred_column=pred_column,
+                 metric=metric,
+                 is_classification=is_classification,
              )

-         # Check if all elements from features_list are present in the feature columns
-         all_present = all(
-             elem in self.inputs.datasets[0].feature_columns for elem in features_list
-         )
-         if not all_present:
-             raise ValueError(
-                 "The list of feature columns provided do not match with training dataset feature columns"
+         results = _prepare_results(results_train, results_test, metric)
+
+         fig = _plot_overfit_regions(results, feature_column, cut_off_threshold, metric)
+         test_figures.append(
+             Figure(
+                 key=f"overfit_diagnosis:{metric}:{feature_column}",
+                 figure=fig,
+                 metadata={
+                     "metric": metric,
+                     "cut_off_threshold": cut_off_threshold,
+                     "feature": feature_column,
+                 },
              )
+         )

-         if not isinstance(features_list, list):
-             raise ValueError(
-                 "features_columns must be a list of features you would like to test"
+         for _, row in results[results["gap"] > cut_off_threshold].iterrows():
+             test_results.append(
+                 {
+                     "Feature": feature_column,
+                     "Slice": row["slice"],
+                     "Number of Records": row["training records"],
+                     f"Training {metric.upper()}": row[f"training {metric}"],
+                     f"Test {metric.upper()}": row[f"test {metric}"],
+                     "Gap": row["gap"],
+                 }
              )

-         target_column = self.inputs.datasets[0].target_column
-         prediction_column = f"{target_column}_pred"
-
-         # Add prediction column in the training dataset
-         train_df = self.inputs.datasets[0].df.copy()
-         train_class_pred = self.inputs.datasets[0].y_pred(self.inputs.model)
-         train_df[prediction_column] = train_class_pred
-
-         # Add prediction column in the test dataset
-         test_df = self.inputs.datasets[1].df.copy()
-         test_class_pred = self.inputs.datasets[1].y_pred(self.inputs.model)
-         test_df[prediction_column] = test_class_pred
-
-         test_results = []
-         test_figures = []
-         results_headers = ["slice", "shape", "feature"]
-         results_headers.extend(self.default_metrics.keys())
-
-         for feature_column in features_list:
-             bins = 10
-             if feature_column in self.inputs.datasets[0].feature_columns_categorical:
-                 bins = len(train_df[feature_column].unique())
-             train_df["bin"] = pd.cut(train_df[feature_column], bins=bins)
-
-             results_train = {k: [] for k in results_headers}
-             results_test = {k: [] for k in results_headers}
-
-             for region, df_region in train_df.groupby("bin"):
-                 self._compute_metrics(
-                     results_train,
-                     region,
-                     df_region,
-                     target_column,
-                     prediction_column,
-                     feature_column,
-                 )
-                 df_test_region = test_df[
-                     (test_df[feature_column] > region.left)
-                     & (test_df[feature_column] <= region.right)
-                 ]
-                 self._compute_metrics(
-                     results_test,
-                     region,
-                     df_test_region,
-                     target_column,
-                     prediction_column,
-                     feature_column,
-                 )
+     return {"Overfit Diagnosis": test_results}, *test_figures

-             results = self._prepare_results(results_train, results_test)

-             fig = self._plot_overfit_regions(
-                 results, feature_column, "accuracy", threshold=cut_off_percentage
-             )
-             # We're currently plotting accuracy gap only
-             test_figures.append(
-                 Figure(
-                     for_object=self,
-                     key=f"{self.name}:accuracy:{feature_column}",
-                     figure=fig,
-                     metadata={
-                         "metric": "accuracy",
-                         "cut_off_percentage": cut_off_percentage,
-                         "feature": feature_column,
-                     },
-                 )
-             )
+ @dataclass
+ class OverfitDiagnosis(ThresholdTest):
+     """Identify overfit regions in a model's predictions.
+
+     This test compares the model's performance on training versus test data, grouped by
+     feature columns. It calculates the difference between the training and test performance
+     for each group and identifies regions where the difference exceeds a specified threshold.
+
+     This test works for both classification and regression models and with a variety of
+     performance metrics. By default, it uses the AUC metric for classification models and
+     the MSE metric for regression models. The threshold for identifying overfit regions
+     defaults to 0.04 but should be adjusted based on the specific use case.
+
+     ## Inputs
+     - `model` (VMModel): The ValidMind model object to evaluate.
+     - `datasets` (List[VMDataset]): A list of two VMDataset objects where the first dataset
+       is the training data and the second dataset is the test data.
+
+     ## Parameters
+     - `metric` (str, optional): The performance metric to use for evaluation. Choose from:
+       'accuracy', 'auc', 'f1', 'precision', 'recall', 'mse', 'mae', 'r2', 'mape'.
+       Defaults to 'auc' for classification models and 'mse' for regression models.
+     - `cut_off_threshold` (float, optional): The threshold for identifying overfit regions.
+       Defaults to 0.04.
+     """

-             results = results[results["gap"] > cut_off_percentage]
-             passed = results.empty
-             results = results.to_dict(orient="records")
-             test_results.append(
+     required_inputs = ["model", "datasets"]
+     default_params = {"metric": None, "cut_off_threshold": DEFAULT_THRESHOLD}
+     tasks = ["classification", "regression"]
+     tags = [
+         "sklearn",
+         "binary_classification",
+         "multiclass_classification",
+         "linear_regression",
+         "model_diagnosis",
+     ]
+
+     def run(self):
+         func_result = overfit_diagnosis(
+             self.inputs.model,
+             self.inputs.datasets,
+             metric=self.params["metric"],
+             cut_off_threshold=self.params["cut_off_threshold"],
+         )
+
+         return self.cache_results(
+             test_results_list=[
                  ThresholdTestResult(
-                     test_name="accuracy",
-                     column=feature_column,
-                     passed=passed,
-                     values={"records": results},
+                     test_name=self.params["metric"],
+                     column=row["Feature"],
+                     passed=False,
+                     values={k: v for k, v in row.items()},
                  )
-             )
-         return self.cache_results(
-             test_results,
-             passed=all([r.passed for r in test_results]),
-             figures=test_figures,
+                 for row in func_result[0]["Overfit Diagnosis"]
+             ],
+             passed=(not func_result[0]["Overfit Diagnosis"]),
+             figures=func_result[1:],
          )

-     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-         results_table = [
-             record for result in results for record in result.values["records"]
-         ]
+     def summary(self, results, _):
          return ResultSummary(
              results=[
                  ResultTable(
-                     data=results_table,
-                     metadata=ResultTableMetadata(title="Overfit Regions Data"),
+                     data=[result.values for result in results],
+                     metadata=ResultTableMetadata(title="Overfit Diagnosis"),
                  )
-             ]
-         )
-
-     def _prepare_results(self, results_train: dict, results_test: dict) -> pd.DataFrame:
-         """
-         Prepares and returns a DataFrame with training and testing results.
-         Args:
-             results_train (dict): A dictionary containing training results.
-             results_test (dict): A dictionary containing testing results.
-         Returns:
-             pd.DataFrame: A DataFrame containing the following columns:
-                 - 'shape': The number of training records used.
-                 - 'slice': The name of the region being evaluated.
-                 - 'training accuracy': The accuracy achieved on the training data (in percentage).
-                 - 'test accuracy': The accuracy achieved on the testing data (in percentage).
-                 - 'gap': The difference between the training and testing accuracies (in percentage).
-         """
-
-         results_train = pd.DataFrame(results_train)
-         results_test = pd.DataFrame(results_test)
-         results = results_train.copy()
-         results.rename(
-             columns={"shape": "training records", "accuracy": "training accuracy"},
-             inplace=True,
-         )
-         results["training accuracy"] = results["training accuracy"] * 100
-         results["test accuracy"] = results_test["accuracy"] * 100
-         results["gap"] = results["training accuracy"] - results["test accuracy"]
-
-         return results
-
-     def _compute_metrics(
-         self,
-         results: dict,
-         region: str,
-         df_region: pd.DataFrame,
-         target_column: str,
-         prediction_column: str,
-         feature_column: str,
-     ) -> None:
-         """
-         Computes and appends the evaluation metrics for a given region.
-         Args:
-             results (dict): A dictionary containing the evaluation results for all regions.
-             region (str): The name of the region being evaluated.
-             df_region (pd.DataFrame): The DataFrame containing the region-specific data.
-             target_column (str): The name of the target column in the DataFrame.
-             prediction_column (str): The name of the column containing the model's predictions.
-         Returns:
-             None
-         """
-
-         results["slice"].append(str(region))
-         results["shape"].append(df_region.shape[0])
-         results["feature"].append(feature_column)
-
-         # Check if df_region is an empty dataframe and if so, append 0 to all metrics
-         if df_region.empty:
-             for metric in self.default_metrics.keys():
-                 results[metric].append(0)
-             return
-
-         y_true = df_region[target_column].values
-         y_prediction = (
-             df_region[prediction_column].astype(df_region[target_column].dtypes).values
+             ],
          )
-
-         for metric, metric_fn in self.default_metrics.items():
-             results[metric].append(metric_fn(y_true, y_prediction))
-
-     def _plot_overfit_regions(
-         self, df: pd.DataFrame, feature_column: str, metric: str, threshold: float
-     ) -> plt.Figure:
-         """
-         Plots the overfit regions of a given DataFrame.
-         Args:
-             df (pd.DataFrame): A DataFrame containing the data to plot.
-             feature_column (str): The name of the feature column to plot.
-             threshold (float): The threshold value for the gap, above which a region is considered to be overfit.
-         Returns:
-             plt.Figure: A matplotlib Figure object containing the plot.
-         """
-
-         # Create a bar plot using seaborn library
-         fig, ax = plt.subplots()
-         barplot = sns.barplot(data=df, x="slice", y="gap", ax=ax)
-         ax.tick_params(axis="x", rotation=90)
-         # Draw threshold line
-         axhline = ax.axhline(
-             y=threshold,
-             color="red",
-             linestyle="--",
-             linewidth=1,
-             label=f"Cut-Off Percentage: {threshold}%",
-         )
-         ax.tick_params(axis="x", labelsize=20)
-         ax.tick_params(axis="y", labelsize=20)
-
-         ax.set_ylabel(f"{metric.capitalize()} Gap (%)", weight="bold", fontsize=18)
-         ax.set_xlabel("Slice/Segments", weight="bold", fontsize=18)
-         ax.set_title(
-             f"Overfit regions in feature column: {feature_column}",
-             weight="bold",
-             fontsize=20,
-             wrap=True,
-         )
-
-         # Get the legend handles and labels from the barplot
-         handles, labels = barplot.get_legend_handles_labels()
-
-         # Append the axhline handle and label
-         handles.append(axhline)
-         labels.append(axhline.get_label())
-
-         # Create a legend with both hue and axhline labels, the threshold line
-         # will show up twice so remove the first element
-         # barplot.legend(handles=handles[:-1], labels=labels, loc="upper right")
-         barplot.legend(
-             handles=handles[:-1],
-             labels=labels,
-             loc="upper center",
-             bbox_to_anchor=(0.5, 0.1),
-             ncol=len(handles),
-         )
-
-         # Do this if you want to prevent the figure from being displayed
-         plt.close("all")
-
-         return fig
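
For orientation, here is a minimal usage sketch of the functional overfit_diagnosis() entry point added in 2.5.1. It is not part of the diff: vm_model, vm_train_ds and vm_test_ds are hypothetical placeholders for VMModel and VMDataset objects initialized elsewhere (with predictions already linked to the model), and the call simply follows the signature and return shape shown in the added code.

# Hypothetical sketch, not from the diff: vm_model, vm_train_ds and vm_test_ds stand in
# for already-initialized VMModel / VMDataset objects.
table, *figures = overfit_diagnosis(
    model=vm_model,
    datasets=[vm_train_ds, vm_test_ds],  # [training data, test data], per the docstring
    metric="f1",                         # a classification metric; None picks "auc" or "mse" automatically
    cut_off_threshold=0.02,              # regions whose train/test gap exceeds this are flagged
)
flagged_regions = table["Overfit Diagnosis"]  # one dict per flagged (feature, slice) region

The class-based OverfitDiagnosis wrapper forwards the same two knobs from self.params, so the test's configurable surface moves from features_columns / cut_off_percentage (an accuracy gap in percentage points) to metric / cut_off_threshold (a gap in the chosen metric's own units).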