validmind 2.5.2__py3-none-any.whl → 2.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
validmind/__version__.py CHANGED
@@ -1 +1 @@
- __version__ = "2.5.2"
+ __version__ = "2.5.8"
validmind/client.py CHANGED
@@ -240,6 +240,11 @@ def init_model(
  vm_model = class_obj(
  pipeline=model,
  input_id=input_id,
+ attributes=(
+ ModelAttributes.from_dict(attributes)
+ if attributes
+ else ModelAttributes()
+ ),
  )
  # TODO: Add metadata for pipeline model
  metadata = get_model_info(vm_model)
@@ -248,6 +253,7 @@ def init_model(
  input_id=input_id,
  model=model, # Trained model instance
  predict_fn=predict_fn,
+ attributes=ModelAttributes.from_dict(attributes) if attributes else None,
  **kwargs,
  )
  metadata = get_model_info(vm_model)
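Usage note: the new `attributes` argument flows through `init_model` into `ModelAttributes.from_dict`, so a task type can be declared when a model is registered. A minimal, hedged sketch of the intended call (the fitted estimator and `input_id` are hypothetical; only the `attributes` dict reflects behavior introduced in this diff):

```python
import validmind as vm

# hypothetical trained estimator and input_id; the `attributes` dict is the
# piece added in this release and is parsed by ModelAttributes.from_dict
vm_model = vm.init_model(
    model=fitted_classifier,
    input_id="credit_default_model",
    attributes={"task": "classification"},  # "regression" is the other value accepted by ModelTask
)
```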
@@ -56,7 +56,6 @@ TestID = Literal[
  "validmind.model_validation.ragas.AnswerSimilarity",
  "validmind.model_validation.ragas.AnswerCorrectness",
  "validmind.model_validation.ragas.ContextRecall",
- "validmind.model_validation.ragas.ContextRelevancy",
  "validmind.model_validation.ragas.ContextPrecision",
  "validmind.model_validation.ragas.AnswerRelevance",
  "validmind.model_validation.sklearn.RegressionModelsPerformanceComparison",
@@ -105,7 +105,7 @@ def AnswerCorrectness(
  "ground_truth": ground_truth_column,
  }

- df = get_renamed_columns(dataset.df, required_columns)
+ df = get_renamed_columns(dataset._df, required_columns)

  result_df = evaluate(
  Dataset.from_pandas(df), metrics=[answer_correctness], **get_ragas_config()
@@ -109,7 +109,7 @@ def AnswerRelevance(
  "contexts": contexts_column,
  }

- df = get_renamed_columns(dataset.df, required_columns)
+ df = get_renamed_columns(dataset._df, required_columns)

  result_df = evaluate(
  Dataset.from_pandas(df), metrics=[answer_relevancy], **get_ragas_config()
@@ -94,7 +94,7 @@ def AnswerSimilarity(
  "ground_truth": ground_truth_column,
  }

- df = get_renamed_columns(dataset.df, required_columns)
+ df = get_renamed_columns(dataset._df, required_columns)

  result_df = evaluate(
  Dataset.from_pandas(df), metrics=[answer_similarity], **get_ragas_config()
@@ -132,7 +132,7 @@ def AspectCritique(
  "contexts": contexts_column,
  }

- df = get_renamed_columns(dataset.df, required_columns)
+ df = get_renamed_columns(dataset._df, required_columns)

  built_in_aspects = [aspect_map[aspect] for aspect in aspects]
  custom_aspects = (
@@ -100,7 +100,7 @@ def ContextEntityRecall(
  "contexts": contexts_column,
  }

- df = get_renamed_columns(dataset.df, required_columns)
+ df = get_renamed_columns(dataset._df, required_columns)

  result_df = evaluate(
  Dataset.from_pandas(df), metrics=[context_entity_recall], **get_ragas_config()
@@ -96,7 +96,7 @@ def ContextPrecision(
  "ground_truth": ground_truth_column,
  }

- df = get_renamed_columns(dataset.df, required_columns)
+ df = get_renamed_columns(dataset._df, required_columns)

  result_df = evaluate(
  Dataset.from_pandas(df), metrics=[context_precision], **get_ragas_config()
@@ -96,7 +96,7 @@ def ContextRecall(
  "ground_truth": ground_truth_column,
  }

- df = get_renamed_columns(dataset.df, required_columns)
+ df = get_renamed_columns(dataset._df, required_columns)

  result_df = evaluate(
  Dataset.from_pandas(df), metrics=[context_recall], **get_ragas_config()
@@ -94,7 +94,7 @@ def Faithfulness(
  "contexts": contexts_column,
  }

- df = get_renamed_columns(dataset.df, required_columns)
+ df = get_renamed_columns(dataset._df, required_columns)

  result_df = evaluate(
  Dataset.from_pandas(df), metrics=[faithfulness], **get_ragas_config()
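Usage note: all of these ragas tests now read the raw dataframe via `dataset._df` instead of the public `df` accessor; from the caller's side the invocation is unchanged. A hedged sketch of running one of them with a non-default column mapping (the dataset input and column name below are illustrative assumptions):

```python
from validmind.tests import run_test

# hypothetical VMDataset holding question/answer/contexts data; the
# contexts_column param is taken from the test's signature shown above
run_test(
    "validmind.model_validation.ragas.Faithfulness",
    inputs={"dataset": vm_ds},
    params={"contexts_column": "retrieved_contexts"},
)
```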
@@ -25,51 +25,48 @@ from validmind.vm_models import (
 
  logger = get_logger(__name__)

+ # TODO: A couple of improvements here could be to:
+ # 1. Allow the test to use multiple metrics at once
+ # 2. Allow custom functions for computing performance
+
  DEFAULT_THRESHOLD = 0.04
+ DEFAULT_CLASSIFICATION_METRIC = "auc"
+ DEFAULT_REGRESSION_METRIC = "mse"
  PERFORMANCE_METRICS = {
  "accuracy": {
  "function": metrics.accuracy_score,
- "is_classification": True,
  "is_lower_better": False,
  },
  "auc": {
  "function": metrics.roc_auc_score,
- "is_classification": True,
  "is_lower_better": False,
  },
  "f1": {
  "function": metrics.f1_score,
- "is_classification": True,
  "is_lower_better": False,
  },
  "precision": {
  "function": metrics.precision_score,
- "is_classification": True,
  "is_lower_better": False,
  },
  "recall": {
  "function": metrics.recall_score,
- "is_classification": True,
  "is_lower_better": False,
  },
  "mse": {
  "function": metrics.mean_squared_error,
- "is_classification": False,
  "is_lower_better": True,
  },
  "mae": {
  "function": metrics.mean_absolute_error,
- "is_classification": False,
  "is_lower_better": True,
  },
  "r2": {
  "function": metrics.r2_score,
- "is_classification": False,
  "is_lower_better": False,
  },
  "mape": {
  "function": metrics.mean_absolute_percentage_error,
- "is_classification": False,
  "is_lower_better": True,
  },
  }
@@ -123,20 +120,13 @@ def _compute_metrics(
  if is_classification and metric == "auc":
  # if only one class is present in the data, return 0
  if len(np.unique(y_true)) == 1:
- results[metric].append(0)
- return
-
- score = metric_func(y_true, df_region[prob_column].values)
-
- # All other classification metrics
- elif is_classification:
- score = metric_func(y_true, df_region[pred_column].values)
+ return results[metric].append(0)

- # Regression metrics
- else:
- score = metric_func(y_true, df_region[pred_column].values)
+ return results[metric].append(
+ metric_func(y_true, df_region[prob_column].values)
+ )

- results[metric].append(score)
+ return results[metric].append(metric_func(y_true, df_region[pred_column].values))


  def _plot_overfit_regions(
@@ -219,8 +209,12 @@ def overfit_diagnosis( # noqa: C901
  is_classification = bool(datasets[0].probability_column(model))

  # Set default metric if not provided
- if metric is None:
- metric = "auc" if is_classification else "mse"
+ if not metric:
+ metric = (
+ DEFAULT_CLASSIFICATION_METRIC
+ if is_classification
+ else DEFAULT_REGRESSION_METRIC
+ )
  logger.info(
  f"Using default {'classification' if is_classification else 'regression'} metric: {metric}"
  )
@@ -228,19 +222,6 @@ def overfit_diagnosis( # noqa: C901
  if id(cut_off_threshold) == id(DEFAULT_THRESHOLD):
  logger.info("Using default cut-off threshold of 0.04")

- metric = metric.lower()
- try:
- _metric = PERFORMANCE_METRICS[metric.lower()]
- except KeyError:
- raise ValueError(
- f"Invalid metric. Choose from: {', '.join(PERFORMANCE_METRICS.keys())}"
- )
-
- if is_classification and not _metric["is_classification"]:
- raise ValueError(f"Cannot use regression metric ({metric}) for classification.")
- elif not is_classification and _metric["is_classification"]:
- raise ValueError(f"Cannot use classification metric ({metric}) for regression.")
-
  train_df = datasets[0].df
  test_df = datasets[1].df

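Usage note: with the per-metric `is_classification` flags and the explicit validation block removed, `overfit_diagnosis` now simply falls back to `DEFAULT_CLASSIFICATION_METRIC` ("auc") or `DEFAULT_REGRESSION_METRIC` ("mse") when no metric is supplied. A hedged sketch of overriding that default through the standard test runner (the test ID comes from the RECORD below; the model and dataset inputs are hypothetical):

```python
from validmind.tests import run_test

# hypothetical vm_model / dataset inputs; param names mirror this diff
run_test(
    "validmind.model_validation.sklearn.OverfitDiagnosis",
    inputs={"model": vm_model, "datasets": [vm_train_ds, vm_test_ds]},
    params={"metric": "f1", "cut_off_threshold": 0.04},
)
```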
@@ -2,17 +2,19 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

+ from collections import defaultdict
  from dataclasses import dataclass
  from operator import add
  from typing import List, Tuple

- import matplotlib.pyplot as plt
  import numpy as np
  import pandas as pd
+ import plotly.graph_objects as go
  import seaborn as sns
  from sklearn import metrics

  from validmind.errors import MissingOrInvalidModelPredictFnError
+ from validmind.logging import get_logger
  from validmind.vm_models import (
  Figure,
  ResultSummary,
@@ -20,291 +22,384 @@ from validmind.vm_models import (
  ResultTableMetadata,
  ThresholdTest,
  ThresholdTestResult,
+ VMDataset,
+ VMModel,
  )

-
- # TODO: make this support regression and classification as well as more performance metrics
- @dataclass
- class RobustnessDiagnosis(ThresholdTest):
+ logger = get_logger(__name__)
+
+ DEFAULT_DECAY_THRESHOLD = 0.05
+ DEFAULT_STD_DEV_LIST = [0.1, 0.2, 0.3, 0.4, 0.5]
+ DEFAULT_CLASSIFICATION_METRIC = "auc"
+ DEFAULT_REGRESSION_METRIC = "mse"
+ PERFORMANCE_METRICS = {
+ "accuracy": {
+ "function": metrics.accuracy_score,
+ "is_lower_better": False,
+ },
+ "auc": {
+ "function": metrics.roc_auc_score,
+ "is_lower_better": False,
+ },
+ "f1": {
+ "function": metrics.f1_score,
+ "is_lower_better": False,
+ },
+ "precision": {
+ "function": metrics.precision_score,
+ "is_lower_better": False,
+ },
+ "recall": {
+ "function": metrics.recall_score,
+ "is_lower_better": False,
+ },
+ "mse": {
+ "function": metrics.mean_squared_error,
+ "is_lower_better": True,
+ },
+ "mae": {
+ "function": metrics.mean_absolute_error,
+ "is_lower_better": True,
+ },
+ "r2": {
+ "function": metrics.r2_score,
+ "is_lower_better": False,
+ },
+ "mape": {
+ "function": metrics.mean_absolute_percentage_error,
+ "is_lower_better": True,
+ },
+ }
+
+
+ def _add_noise_std_dev(
+ values: List[float], x_std_dev: float
+ ) -> Tuple[List[float], float]:
  """
- Evaluates the robustness of a machine learning model by injecting Gaussian noise to input data and measuring
- performance.
-
- **Purpose**:
-
- The purpose of this test code is to evaluate the robustness of a machine learning model. Robustness refers to a
- model's ability to maintain a high level of performance in the face of perturbations or changes—particularly
- noise—added to its input data. This test is designed to help gauge how well the model can handle potential
- real-world scenarios where the input data might be incomplete or corrupted.
-
- **Test Mechanism**:
-
- This test is conducted by adding Gaussian noise, proportional to a particular standard deviation scale, to numeric
- input features of both the training and testing datasets. The model performance in the face of these perturbed
- features is then evaluated using the ROC_AUC score. This process is iterated over a range of scale
- factors. The resulting auc trend against the amount of noise introduced is illustrated with a line chart. A
- predetermined threshold determines what level of auc decay due to perturbation is considered acceptable.
-
- **Signs of High Risk**:
- - Substantial decreases in auc when noise is introduced to feature inputs.
- - The decay in auc surpasses the configured threshold, indicating that the model is not robust against input
- noise.
- - Instances where one or more elements provided in the features list don't match with the training dataset's
- numerical feature columns.
-
- **Strengths**:
- - Provides an empirical measure of the model's performance in tackling noise or data perturbations, revealing
- insights into the model's stability.
- - Offers flexibility with the ability to choose specific features to perturb and control the level of noise applied.
- - Detailed results visualization helps in interpreting the outcome of robustness testing.
-
- **Limitations**:
- - The default threshold for auc decay is set to 0.05, which is unlikely to be optimal for most use cases and
- should be adjusted based on domain expertise to suit the needs of the specific model.
- - Only numerical features are perturbed, leaving out non-numerical features, which can lead to an incomplete
- analysis of robustness.
- - The test is contingent on the assumption that the added Gaussian noise sufficiently represents potential data
- corruption or incompleteness in real-world scenarios.
+ Adds Gaussian noise to a list of values.
+ Args:
+ values (list[float]): A list of numerical values to which noise is added.
+ x_std_dev (float): A scaling factor for the standard deviation of the noise.
+ Returns:
+ tuple[list[float], float]: A tuple containing:
+ - A list of noisy values, where each value is the sum of the corresponding value
+ in the input list and a randomly generated value sampled from a Gaussian distribution
+ with mean 0 and standard deviation x_std_dev times the standard deviation of the input list.
+ - The standard deviation of the input list of values.
  """
+ std_dev = np.std(values)
+ noise_list = np.random.normal(0, x_std_dev * std_dev, size=len(values))
+ noisy_values = list(map(add, noise_list, values))

- name = "robustness"
- required_inputs = ["model", "datasets"]
- default_params = {
- "features_columns": None,
- "scaling_factor_std_dev_list": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
- "auc_decay_threshold": 0.05,
- }
- tasks = ["classification"]
- tags = [
- "sklearn",
- "binary_classification",
- "multiclass_classification",
- "model_diagnosis",
- "visualization",
- ]
+ return noisy_values

- def run(self):
- # Validate X std deviation parameter
- if "scaling_factor_std_dev_list" not in self.params:
- raise ValueError("scaling_factor_std_dev_list must be provided in params")
- x_std_dev_list = self.params["scaling_factor_std_dev_list"]

- if self.params["auc_decay_threshold"] is None:
- raise ValueError("auc_decay_threshold must be provided in params")
- auc_threshold = self.params["auc_decay_threshold"]
+ def _compute_metric(
+ dataset: VMDataset, model: VMModel, X: pd.DataFrame, metric: str
+ ) -> float:
+ if metric not in PERFORMANCE_METRICS:
+ raise ValueError(
+ f"Invalid metric: {metric}, expected one of {PERFORMANCE_METRICS.keys()}"
+ )

- if self.inputs.model is None:
- raise ValueError("model must of provided to run this test")
+ if metric == "auc":
+ try:
+ y_proba = model.predict_proba(X)
+ except MissingOrInvalidModelPredictFnError:
+ y_proba = model.predict(X)
+ return metrics.roc_auc_score(dataset.y, y_proba)

- # Validate list of features columns need to be perterubed
- if "features_columns" not in self.params:
- raise ValueError("features_columns must be provided in params")
+ return PERFORMANCE_METRICS[metric]["function"](dataset.y, model.predict(X))

- features_list = self.params["features_columns"]
- if features_list is None:
- features_list = self.inputs.datasets[0].feature_columns

- # Check if all elements from features_list are present in the numerical feature columns
- all_present = all(
- elem in self.inputs.datasets[0].feature_columns for elem in features_list
- )
- if not all_present:
- raise ValueError(
- "The list of feature columns provided do not match with training "
- + "dataset numerical feature columns"
- )
+ def _compute_gap(result: dict, metric: str) -> float:
+ if PERFORMANCE_METRICS[metric]["is_lower_better"]:
+ return result[metric.upper()][-1] - result[metric.upper()][0]

- if self.inputs.datasets[0].text_column in features_list:
- raise ValueError(
- "Skiping Robustness Diagnosis test for the dataset with text column"
- )
+ return result[metric.upper()][0] - result[metric.upper()][-1]

- train_df = self.inputs.datasets[0].x_df().copy()
- train_y_true = self.inputs.datasets[0].y

- test_df = self.inputs.datasets[1].x_df().copy()
- test_y_true = self.inputs.datasets[1].y
+ def _combine_results(results: List[dict]):
+ final_results = defaultdict(list)

- test_results = []
- test_figures = []
+ # Interleave rows from each dictionary
+ for i in range(len(results[0]["Perturbation Size"])):
+ for result in results:
+ for key in result.keys():
+ final_results[key].append(result[key][i])

- results_headers = ["Perturbation Size", "Dataset Type", "Records", "AUC"]
- results = {k: [] for k in results_headers}
- # Iterate scaling factor for the standard deviation list
- for x_std_dev in x_std_dev_list:
- temp_train_df = train_df.copy()
- temp_test_df = test_df.copy()
+ return pd.DataFrame(final_results)

- # Add noise to numeric features columns provided by user
- for feature in features_list:
- temp_train_df[feature] = self._add_noise_std_dev(
- temp_train_df[feature].to_list(), x_std_dev
- )
- temp_test_df[feature] = self._add_noise_std_dev(
- temp_test_df[feature].to_list(), x_std_dev
- )

- self._compute_metrics(
- results, temp_train_df, train_y_true, x_std_dev, "Training"
+ def _plot_robustness(
+ results: pd.DataFrame, metric: str, threshold: float, columns: List[str], model: str
+ ):
+ fig = go.Figure()
+
+ datasets = results["Dataset"].unique()
+ pallete = [
+ f"#{int(r*255):02x}{int(g*255):02x}{int(b*255):02x}"
+ for r, g, b in sns.color_palette("husl", len(datasets))
+ ]
+
+ for i, dataset in enumerate(datasets):
+ dataset_results = results[results["Dataset"] == dataset]
+ fig.add_trace(
+ go.Scatter(
+ x=dataset_results["Perturbation Size"],
+ y=dataset_results[metric.upper()],
+ mode="lines+markers",
+ name=dataset,
+ line=dict(width=3, color=pallete[i]),
+ marker=dict(size=10),
  )
- self._compute_metrics(results, temp_test_df, test_y_true, x_std_dev, "Test")
-
- fig, df = self._plot_robustness(results, features_list)
-
- test_figures.append(
- Figure(
- for_object=self,
- key=f"{self.name}:auc",
- figure=fig,
- metadata={
- "metric": "AUC",
- "features_list": features_list,
- },
+ )
+
+ if PERFORMANCE_METRICS[metric]["is_lower_better"]:
+ y_label = f"{metric.upper()} (lower is better)"
+ else:
+ threshold = -threshold
+ y_label = f"{metric.upper()} (higher is better)"
+
+ # add threshold lines
+ for i, dataset in enumerate(datasets):
+ baseline = results[results["Dataset"] == dataset][metric.upper()].iloc[0]
+ fig.add_trace(
+ go.Scatter(
+ x=results["Perturbation Size"].unique(),
+ y=[baseline + threshold] * len(results["Perturbation Size"].unique()),
+ mode="lines",
+ name=f"threshold_{dataset}",
+ line=dict(dash="dash", width=2, color=pallete[i]),
+ showlegend=True,
  )
  )

- train_auc = df.loc[(df["Dataset Type"] == "Training"), "AUC"].values[0]
- test_auc = df.loc[(df["Dataset Type"] == "Test"), "AUC"].values[0]
-
- df["Passed"] = np.where(
- (df["Dataset Type"] == "Training")
- & (df["AUC"] >= (train_auc - auc_threshold)),
- True,
- np.where(
- (df["Dataset Type"] == "Test")
- & (df["AUC"] >= (test_auc - auc_threshold)),
- True,
- False,
+ columns_lines = [""]
+ for column in columns:
+ # keep adding to the last line in list until character limit (40)
+ if len(columns_lines[-1]) + len(column) < 40:
+ columns_lines[-1] += f"{column}, "
+ else:
+ columns_lines.append(f"{column}, ")
+
+ fig.update_layout(
+ title=dict(
+ text=(
+ f"Model Robustness for '{model}'<br><sup>As determined by calculating "
+ f"{metric.upper()} decay in the presence of random gaussian noise</sup>"
  ),
+ font=dict(size=20),
+ x=0.5,
+ xanchor="center",
+ ),
+ xaxis_title=dict(
+ text="Perturbation Size (X * Standard Deviation)",
+ ),
+ yaxis_title=dict(text=y_label),
+ plot_bgcolor="white",
+ margin=dict(t=60, b=80, r=20, l=60),
+ xaxis=dict(showgrid=True, gridcolor="lightgrey"),
+ yaxis=dict(showgrid=True, gridcolor="lightgrey"),
+ annotations=[
+ go.layout.Annotation(
+ text=f"Perturbed Features:<br><sup>{'<br>'.join(columns_lines)}</sup>",
+ align="left",
+ font=dict(size=14),
+ bordercolor="lightgrey",
+ borderwidth=1,
+ borderpad=4,
+ showarrow=False,
+ x=1.025,
+ xref="paper",
+ xanchor="left",
+ y=-0.15,
+ yref="paper",
+ )
+ ],
+ )
+
+ return fig
+
+
+ # TODO: make this a functional test instead of class-based when appropriate
+ # simply have to remove the class and rename this func to OverfitDiagnosis
+ def robustness_diagnosis(
+ model: VMModel,
+ datasets: List[VMDataset],
+ metric: str = None,
+ scaling_factor_std_dev_list: List[float] = DEFAULT_STD_DEV_LIST,
+ performance_decay_threshold: float = DEFAULT_DECAY_THRESHOLD,
+ ):
+ if not metric:
+ metric = (
+ DEFAULT_CLASSIFICATION_METRIC
+ if datasets[0].probability_column(model)
+ else DEFAULT_REGRESSION_METRIC
+ )
+ logger.info(f"Using default metric ({metric.upper()}) for robustness diagnosis")
+
+ if id(scaling_factor_std_dev_list) == id(DEFAULT_STD_DEV_LIST):
+ logger.info(
+ f"Using default scaling factors for the standard deviation of the noise: {DEFAULT_STD_DEV_LIST}"
+ )
+
+ if id(performance_decay_threshold) == id(DEFAULT_DECAY_THRESHOLD):
+ logger.info(
+ f"Using default performance decay threshold of {DEFAULT_DECAY_THRESHOLD}"
  )
- test_results.append(
- ThresholdTestResult(
- test_name="AUC",
- column=features_list,
- passed=True,
- values={"records": df.to_dict("records")},
+
+ results = [{} for _ in range(len(datasets))]
+
+ # add baseline results (no perturbation)
+ for dataset, result in zip(datasets, results):
+ result["Perturbation Size"] = [0.0]
+ result["Dataset"] = [f"{dataset.input_id}"]
+ result["Row Count"] = [dataset._df.shape[0]]
+
+ result[metric.upper()] = [
+ _compute_metric(
+ dataset=dataset,
+ model=model,
+ X=dataset.x_df(),
+ metric=metric,
  )
+ ]
+ result["Performance Decay"] = [0.0]
+ result["Passed"] = [True]
+
+ # Iterate scaling factor for the standard deviation list
+ for x_std_dev in scaling_factor_std_dev_list:
+ for dataset, result in zip(datasets, results):
+
+ result["Perturbation Size"].append(x_std_dev)
+ result["Dataset"].append(result["Dataset"][0])
+ result["Row Count"].append(result["Row Count"][0])
+
+ temp_df = dataset.x_df().copy()
+ for feature in dataset.feature_columns_numeric:
+ temp_df[feature] = _add_noise_std_dev(
+ values=temp_df[feature].to_list(),
+ x_std_dev=x_std_dev,
+ )
+
+ result[metric.upper()].append(
+ _compute_metric(
+ dataset=dataset,
+ model=model,
+ X=temp_df,
+ metric=metric,
+ )
+ )
+ result["Performance Decay"].append(_compute_gap(result, metric))
+ result["Passed"].append(
+ result["Performance Decay"][-1] < performance_decay_threshold
+ )
+
+ results_df = _combine_results(results)
+ fig = _plot_robustness(
+ results=results_df,
+ metric=metric,
+ threshold=performance_decay_threshold,
+ columns=datasets[0].feature_columns_numeric,
+ model=model.input_id,
+ )
+
+ # rename perturbation size for baseline
+ results_df["Perturbation Size"][
+ results_df["Perturbation Size"] == 0.0
+ ] = "Baseline (0.0)"
+
+ return results_df, fig
+
+
+ @dataclass
+ class RobustnessDiagnosis(ThresholdTest):
+ """Evaluate the robustness of a machine learning model to noise
+
+ Robustness refers to a model's ability to maintain a high level of performance in
+ the face of perturbations or changes (particularly noise) added to its input data.
+ This test is designed to help gauge how well the model can handle potential real-
+ world scenarios where the input data might be incomplete or corrupted.
+
+ ## Test Methodology
+ This test is conducted by adding Gaussian noise, proportional to a particular standard
+ deviation scale, to numeric input features of the input datasets. The model's
+ performance on the perturbed data is then evaluated using a user-defined metric or the
+ default metric of AUC for classification tasks and MSE for regression tasks. The results
+ are then plotted to visualize the model's performance decay as the perturbation size
+ increases.
+
+ When using this test, it is highly recommended to tailor the performance metric, list
+ of scaling factors for the standard deviation of the noise, and the performance decay
+ threshold to the specific use case of the model being evaluated.
+
+ **Inputs**:
+ - model (VMModel): The trained model to be evaluated.
+ - datasets (List[VMDataset]): A list of datasets to evaluate the model against.
+
+ ## Parameters
+ - metric (str, optional): The performance metric to be used for evaluation. If not
+ provided, the default metric is used based on the task of the model. Default values
+ are "auc" for classification tasks and "mse" for regression tasks.
+ - scaling_factor_std_dev_list (List[float], optional): A list of scaling factors for
+ the standard deviation of the noise to be added to the input features. The default
+ values are [0.1, 0.2, 0.3, 0.4, 0.5].
+ - performance_decay_threshold (float, optional): The threshold for the performance
+ decay of the model. The default value is 0.05.
+ """
+
+ name = "robustness"
+ required_inputs = ["model", "datasets"]
+ default_params = {
+ "metric": None,
+ "scaling_factor_std_dev_list": DEFAULT_STD_DEV_LIST,
+ "performance_decay_threshold": DEFAULT_DECAY_THRESHOLD,
+ }
+ tasks = ["classification", "regression"]
+ tags = [
+ "sklearn",
+ "model_diagnosis",
+ "visualization",
+ ]
+
+ def run(self):
+ results, fig = robustness_diagnosis(
+ model=self.inputs.model,
+ datasets=self.inputs.datasets,
+ metric=self.params["metric"],
+ scaling_factor_std_dev_list=self.params["scaling_factor_std_dev_list"],
+ performance_decay_threshold=self.params["performance_decay_threshold"],
  )
+
  return self.cache_results(
- test_results, passed=df["Passed"].all(), figures=test_figures
+ passed=results["Passed"].all(),
+ test_results_list=[
+ ThresholdTestResult(
+ test_name=self.params["metric"],
+ passed=results["Passed"].all(),
+ values=results.to_dict(orient="records"),
+ )
+ ],
+ figures=[
+ Figure(
+ for_object=self,
+ key=f"{self.name}:{self.params['metric']}",
+ figure=fig,
+ )
+ ],
  )

  def summary(self, results: List[ThresholdTestResult], _):
- results_table = [
- record for result in results for record in result.values["records"]
- ]
  return ResultSummary(
  results=[
  ResultTable(
- data=results_table,
- metadata=ResultTableMetadata(title="Robustness test"),
+ data=results[0].values,
+ metadata=ResultTableMetadata(title="Robustness Diagnosis Results"),
  )
  ]
  )

- def _compute_metrics(
- self,
- results: dict,
- df: pd.DataFrame,
- y_true: str,
- x_std_dev: float,
- dataset_type: str,
- ):
- """
- Compute evaluation metrics for a given perturbed dataset.
- Args:
- results (dict): A dictionary to store the results of the computation.
- df (pd.DataFrame): A Pandas dataframe containing the dataset to evaluate.
- y_true (str): A string representing the name of the column containing the true target values.
- x_std_dev (float): A float representing the standard deviation of the perturbation applied to the dataset.
- dataset_type (str): A string representing the type of dataset (e.g. "training", "validation", "test").
- Returns:
- None
- """
- results["Dataset Type"].append(dataset_type)
- results["Perturbation Size"].append(x_std_dev)
- results["Records"].append(df.shape[0])
-
- try:
- y_proba = self.inputs.model.predict_proba(df)
- except MissingOrInvalidModelPredictFnError:
- y_proba = self.inputs.model.predict(df)
-
- results["AUC"].append(metrics.roc_auc_score(y_true, y_proba))
-
- def _add_noise_std_dev(
- self, values: List[float], x_std_dev: float
- ) -> Tuple[List[float], float]:
- """
- Adds Gaussian noise to a list of values.
- Args:
- values (list[float]): A list of numerical values to which noise is added.
- x_std_dev (float): A scaling factor for the standard deviation of the noise.
- Returns:
- tuple[list[float], float]: A tuple containing:
- - A list of noisy values, where each value is the sum of the corresponding value
- in the input list and a randomly generated value sampled from a Gaussian distribution
- with mean 0 and standard deviation x_std_dev times the standard deviation of the input list.
- - The standard deviation of the input list of values.
- """
- std_dev = np.std(values)
- noise_list = np.random.normal(0, x_std_dev * std_dev, size=len(values))
- noisy_values = list(map(add, noise_list, values))
-
- return noisy_values
-
- def _plot_robustness(self, results: dict, features_columns: List[str]):
- """
- Plots the model's auc under feature perturbations.
- Args:
- results (dict): A dictionary containing the results of the evaluation.
- It has the following keys:
- - 'Dataset Type': the type of dataset evaluated, e.g. 'Training' or 'Test'.
- - 'Perturbation Size': the size of the perturbation applied to the features.
- - 'Records': the number of records evaluated.
- - 'auc': the ROC AUC score obtained for the evaluation.
- The values of each key are lists containing the results for each evaluation.
- features_columns (list[str]): A list containing the names of the features perturbed.
- Returns:
- tuple[matplotlib.figure.Figure, pd.DataFrame]: A tuple containing the matplotlib Figure object
- and a DataFrame containing the results used to generate the plot.
- """
- df = pd.DataFrame(results)
-
- # Create a bar plot using seaborn library
- fig, ax = plt.subplots()
- sns.lineplot(
- data=df,
- x="Perturbation Size",
- y="AUC",
- hue="Dataset Type",
- style="Dataset Type",
- linewidth=3,
- markers=True,
- markersize=10,
- dashes=False,
- palette=["red", "blue"],
- ax=ax,
- )
- ax.tick_params(axis="x")
- ax.set_ylabel("AUC", weight="bold", fontsize=18)
- ax.legend(fontsize=18)
- ax.set_xlabel(
- "Perturbation Size (X * Standard Deviation)", weight="bold", fontsize=18
- )
- ax.set_title(
- f"Perturbed Features: {', '.join(features_columns)}",
- weight="bold",
- fontsize=20,
- wrap=True,
- )
-
- # Do this if you want to prevent the figure from being displayed
- plt.close("all")
-
- # fig, ax = plt.subplots()
- return fig, df
-
  def test(self):
  """Unit Test for Robustness Diagnosis Threshold Test"""
  # Verify the result object is present
@@ -313,16 +408,8 @@ class RobustnessDiagnosis(ThresholdTest):
  # Verify test results and their type
  assert isinstance(self.result.test_results.results, list)

- # Check for presence and validity of 'values' dict and 'passed' flag in each result
+ # Check for presence and validity of 'values' and 'passed' flag in each result
  for test_result in self.result.test_results.results:
  assert "values" in test_result.__dict__
  assert "passed" in test_result.__dict__
- assert isinstance(test_result.values, dict)
- assert "records" in test_result.values
-
- # For unperturbed training dataset, auc should be present
- if (
- test_result.column == self.params["features_columns"]
- and 0.0 in test_result.values["records"][0]["Perturbation Size"]
- ):
- assert "AUC" in test_result.values["records"][0]
+ assert isinstance(test_result.values, list)
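Usage note: the rewritten test keeps the class-based entry point but delegates to the new `robustness_diagnosis` function, so the old `features_columns` and `auc_decay_threshold` parameters are gone. A hedged sketch of invoking it with the new parameter names (the test ID comes from the RECORD below; the model and dataset inputs are hypothetical):

```python
from validmind.tests import run_test

# hypothetical inputs; parameter names come from default_params in the new class
run_test(
    "validmind.model_validation.sklearn.RobustnessDiagnosis",
    inputs={"model": vm_model, "datasets": [vm_train_ds, vm_test_ds]},
    params={
        "metric": "auc",
        "scaling_factor_std_dev_list": [0.1, 0.2, 0.3],
        "performance_decay_threshold": 0.05,
    },
)
```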
validmind/tests/run.py CHANGED
@@ -405,7 +405,7 @@ def run_test(
 
  if unit_metrics:
  metric_id_name = "".join(word[0].upper() + word[1:] for word in name.split())
- test_id = f"validmind.composite_test.{metric_id_name}"
+ test_id = f"validmind.composite_metric.{metric_id_name}"

  error, TestClass = load_composite_metric(
  unit_metrics=unit_metrics, metric_name=metric_id_name
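Usage note: the only change here is the ID prefix for composed tests; a `run_test` call with `unit_metrics` now registers its result under `validmind.composite_metric.<Name>` instead of `validmind.composite_test.<Name>`. A hedged sketch (the unit-metric IDs and inputs below are illustrative assumptions, not taken from this diff):

```python
from validmind.tests import run_test

# "Model Performance" -> metric_id_name "ModelPerformance", so the composed
# result is now stored as "validmind.composite_metric.ModelPerformance"
run_test(
    name="Model Performance",
    unit_metrics=[  # assumed unit metric IDs, for illustration only
        "validmind.unit_metrics.classification.Accuracy",
        "validmind.unit_metrics.classification.F1",
    ],
    inputs={"model": vm_model, "dataset": vm_test_ds},
)
```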
@@ -10,6 +10,7 @@ import pandas as pd
 
  from validmind.errors import MissingOrInvalidModelPredictFnError
  from validmind.logging import get_logger
+ from validmind.vm_models.model import ModelTask

  logger = get_logger(__name__)

@@ -118,8 +119,14 @@ def compute_predictions(model, X, **kwargs) -> tuple:
  "You can pass `prediction_values` or `prediction_columns` to use precomputed predictions"
  )

- # TODO: this is really not ideal/robust and should not be handled by dataset class
- if probability_values is None and _is_probabilties(prediction_values):
+ if model.attributes.task is ModelTask.REGRESSION:
+ logger.info("Model is configured for regression.")
+ return probability_values, prediction_values
+
+ if probability_values is None and (
+ model.attributes.task is ModelTask.CLASSIFICATION
+ or _is_probabilties(prediction_values)
+ ):
  logger.info(
  "Predict method returned probabilities instead of direct labels or regression values. "
  "This implies the model is likely configured for a classification task with probability output."
@@ -9,6 +9,7 @@ import importlib
  import inspect
  from abc import abstractmethod
  from dataclasses import dataclass
+ from enum import Enum

  from validmind.errors import MissingOrInvalidModelPredictFnError

@@ -38,6 +39,14 @@ R_MODEL_METHODS = [
  ]


+ class ModelTask(Enum):
+ """Model task enums"""
+
+ # TODO: add more tasks
+ CLASSIFICATION = "classification"
+ REGRESSION = "regression"
+
+
  class ModelPipeline:
  """Helper class for chaining models together

@@ -65,6 +74,7 @@ class ModelAttributes:
  framework: str = None
  framework_version: str = None
  language: str = None
+ task: ModelTask = None

  @classmethod
  def from_dict(cls, data):
@@ -76,6 +86,7 @@ class ModelAttributes:
  framework=data.get("framework"),
  framework_version=data.get("framework_version"),
  language=data.get("language"),
+ task=ModelTask(data.get("task")) if data.get("task") else None,
  )


@@ -108,7 +119,7 @@ class VMModel(VMInput):
 
  self.name = name or self.__class__.__name__

- self.attributes = attributes
+ self.attributes = attributes or ModelAttributes()

  # set any additional attributes passed in (likely for subclasses)
  for key, value in kwargs.items():
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: validmind
- Version: 2.5.2
+ Version: 2.5.8
  Summary: ValidMind Developer Framework
  License: Commercial License
  Author: Andres Rodriguez
@@ -1,9 +1,9 @@
  validmind/__init__.py,sha256=UfmzPwUCdUWbWq3zPqqmq4jw0_kfl3hX4U72p_seE4I,3700
- validmind/__version__.py,sha256=V-NiKyTdzd5WY2b4iSwaM1JcbOEyZ0IH2WQKBnjN2DI,22
+ validmind/__version__.py,sha256=mNA8KAyMUolRKqUZCQp6s1ZGetufDZcybBUJHOyKaZA,22
  validmind/ai/test_descriptions.py,sha256=Q1Ftus4x5eiVLKWJu7hqPLukBQZzhy-dARqq_6_JWtk,9464
  validmind/ai/utils.py,sha256=TEXII_S5CpkpczzSyHwTlqLcPMLnPBJWEBR6QFMKh1U,3421
  validmind/api_client.py,sha256=JZIJWuYtvl-VEVi_AK4c839Fn7cGa40J2d4_4FUZcno,17483
- validmind/client.py,sha256=guXu_9um4caPpepbAsfKgjLc63Ygx07Lgp8wZJD3p6Y,18653
+ validmind/client.py,sha256=tFqjbTbJ5AVOythRMn5vcoBm3uCKFbV2yPmk-XqForE,18902
  validmind/client_config.py,sha256=58L6s6-9vFWC9vkSs_98CjV1YWmlksdhblJtPQxQsAk,1611
  validmind/datasets/__init__.py,sha256=oYfcvW7BAyUgpghBOnTeGbQF6tpFAWg38rRirdLr8m8,262
  validmind/datasets/classification/__init__.py,sha256=HlTOBLyb6IorRYmAhP3AIyX-l-NyemyDjV8BBOdrCrY,1787
@@ -86,7 +86,7 @@ validmind/test_suites/tabular_datasets.py,sha256=WE4eLzRCfiqAxRqXnZFRR3Lo_u-TI6K
  validmind/test_suites/text_data.py,sha256=YGVGBB05356jN9Gzcy5CHShRzo1fm5mKsZY7YBq0cYU,739
  validmind/test_suites/time_series.py,sha256=msUyYySAe5VHJJp6z0k0cNt2ekMB8-XkxGER75Zs1hs,6724
  validmind/tests/__init__.py,sha256=niYvgTHmjS5E42mJMCrzq1vP8PTKCWxVsqSkAaw2wsE,1036
- validmind/tests/__types__.py,sha256=AaPsQrxikIasGshJN5AmKCTzLaZ9d4QBDT1c0Br2sDE,10142
+ validmind/tests/__types__.py,sha256=Kgxiyf2djYcKl3ZMg3ND1_f1Hd7Z9VeRTEUnePDDf0U,10085
  validmind/tests/_store.py,sha256=G604L9g-XIJz8u7BLbHVVVcbx96tDYjAAciaF7wJoiM,2743
  validmind/tests/data_validation/ACFandPACFPlot.py,sha256=NLoLe-9Z6_41RBee-gRYe4u3kaGojF7ujlyyIk4o3BU,4900
  validmind/tests/data_validation/ADF.py,sha256=36ZdB8L-hgN0EnYlcxeSsQ3luWip8Qfz_nrYV-1lr74,5113
@@ -190,15 +190,14 @@ validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py,sha2
  validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py,sha256=npnOPAoXb5FoiwQEwp_gDcbGa5xk4rYnXChTJnuGX64,4405
  validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py,sha256=qLydyTQ6mzHOYQzqysjPPe_ltiTsRfPEhZDEDm5XxX8,4825
  validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py,sha256=ereo_dGf19xqvOGz7zcGwhDRU_UqvjFRi5n4KmGDKl8,4511
- validmind/tests/model_validation/ragas/AnswerCorrectness.py,sha256=XOEpsVqeaUCGUILu81ZLXSDlyqLStPX9ZKUJDrTh2Bg,5138
- validmind/tests/model_validation/ragas/AnswerRelevance.py,sha256=Tz3rNO2PnHvvAw7YKWPUZiiitJTH6VJDG8LjC9Nkwx8,4997
- validmind/tests/model_validation/ragas/AnswerSimilarity.py,sha256=O1xq5GNnWB9roscmaBRGiWEi0BvXh705ppeKJTP3O58,4457
- validmind/tests/model_validation/ragas/AspectCritique.py,sha256=08jlfL4qVuRM-U4Y-zGYb9iy-DLsXk7JtdvA0yRK498,6263
- validmind/tests/model_validation/ragas/ContextEntityRecall.py,sha256=Du3A5Jkpt9_msaF3bDy6tNvgomUkmgwsIxO2zdZmmyA,4904
- validmind/tests/model_validation/ragas/ContextPrecision.py,sha256=-4LBSu1ovzIuf2evSoSwyYdrNtGl5-9w8yRQzjcdDtY,4630
- validmind/tests/model_validation/ragas/ContextRecall.py,sha256=i72mPF8eO8BUrshdvn6Mpoq9oFSfvPH1lfWk-LKvS8w,4567
- validmind/tests/model_validation/ragas/ContextRelevancy.py,sha256=TcfC-O7vj2zDU1UqIYC4KgAQaA9aaOtSUHuXfl70JJE,4155
- validmind/tests/model_validation/ragas/Faithfulness.py,sha256=89EeM0lrUq5MAhKYhOO9cnp32WCap6eG2n28SjZH9c4,4525
+ validmind/tests/model_validation/ragas/AnswerCorrectness.py,sha256=UhspG4nY901ZhAmgEzABWiYQPx0rKEJqQnrFCunwnN8,5139
+ validmind/tests/model_validation/ragas/AnswerRelevance.py,sha256=_hD24Ecs1TZQl-lEoFtdgNGg3hXL-VyfmimiJaovnvY,4998
+ validmind/tests/model_validation/ragas/AnswerSimilarity.py,sha256=c1xc4F4gwrrJKn1eEhZQbw1nc39Q2zS75AS9G3XUMAI,4458
+ validmind/tests/model_validation/ragas/AspectCritique.py,sha256=2hGGVMb0_va9Gjqyu1OUI-CSpD6k7ICMnwEYEtRGadk,6264
+ validmind/tests/model_validation/ragas/ContextEntityRecall.py,sha256=zZGenHhWZQRm9CxAl-ZgbHva6vUlbI_jsFkuY4B2LS8,4905
+ validmind/tests/model_validation/ragas/ContextPrecision.py,sha256=FdXTL8KXv6q5lR1BItkCAt105qikYmd89KgvLOqkatE,4631
+ validmind/tests/model_validation/ragas/ContextRecall.py,sha256=fNawcRi5M8773mh-QcuUaJsdoLrkCDPza-qvOBMApKk,4568
+ validmind/tests/model_validation/ragas/Faithfulness.py,sha256=Jg9SK9NPSbLG9nmM1tu55FGSFpqbb3P4e7kPg20OD_8,4526
  validmind/tests/model_validation/ragas/utils.py,sha256=zh9_pGitutGBS4Tvk3Bw1D-QVnDueggNErAhAvMPUOA,3130
  validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py,sha256=KhQroHKDG4gpRAoD9Clw65qNslwGS93rTETdZTOqoTk,2840
  validmind/tests/model_validation/sklearn/AdjustedRandIndex.py,sha256=nSs1BGC8MSWWp7T6M0FZR5kNPraiZwRJF7U6LNCoMMM,2715
@@ -217,7 +216,7 @@ validmind/tests/model_validation/sklearn/MinimumAccuracy.py,sha256=5KSAd29dbKs3n
  validmind/tests/model_validation/sklearn/MinimumF1Score.py,sha256=TaLHk98CwQigyt17L1uBBLC25D5J_IKb6a_IFJFO7AE,4618
  validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py,sha256=Z5JZ4edtzuyneI8qSmGv-OKL2PVq5dg44CwSmePz3OU,5102
  validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py,sha256=-fGgddsc_0832zTl_gRRsLx2sZWBPB0FdS5YmbluN8s,6132
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py,sha256=1nei-MIF7utxuPJXnhTKIaLugUsxk1s4cFob1CR08Yg,13444
+ validmind/tests/model_validation/sklearn/OverfitDiagnosis.py,sha256=kB392ZQYqsPCgVDbqZ-056PliVJ_3Txogf-5iF37qgI,12750
  validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py,sha256=CqMuBuNWzzTtzVcmhAlJHPmtyDO5YuaoXk5hhIXmRuY,4926
  validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py,sha256=chYVS4OcvSG3YA91N7VDJ4Lh7EDgNEcUM8_k72s13IM,10072
  validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py,sha256=V0SS06u8DsyaJpL0S14HBPAQwJJYXnvP3fNp2P4CT84,4363
@@ -227,7 +226,7 @@ validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py,sha256=CH
  validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py,sha256=ELYhY_My1YqS4_i2fnHgL5Dg7vKUIa0wska0bkAFkuU,5737
  validmind/tests/model_validation/sklearn/RegressionR2Square.py,sha256=Ojm5sz3re4rk17u7xiezn1P_rp7wcA3etKgzdhGYH-s,4906
  validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py,sha256=tGJKpfeTvU2xBxsYbQSC5GPDcCS2_j0FcT3uceXZduI,2761
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py,sha256=ntzu5W4JF6pM8YXwtWVnQHx8zGaZMYSTlQlD38XpCUo,13366
+ validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py,sha256=-DyGzQ0PItOISGqtgn2b0WVGG3hycg3lRdgjFM_jPdk,14400
  validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py,sha256=ECYjHHIz5kfnLi2XlzWOKquRf23_77kdcPK8Xw2qwQk,8887
  validmind/tests/model_validation/sklearn/SilhouettePlot.py,sha256=6PZ_sqiPBpL4_fyRE_sg0bSWWrDkryh_v-88KK4i3RQ,6185
  validmind/tests/model_validation/sklearn/TrainingTestDegradation.py,sha256=K3F8Ev7nIaIjwLHC9ljnMp07YwZeqo4RLui5C6IDuR8,7209
@@ -270,7 +269,7 @@ validmind/tests/prompt_validation/Robustness.py,sha256=fBdkYnO9yoBazz4wD-l62tT8D
  validmind/tests/prompt_validation/Specificity.py,sha256=h3gKRTTi2rfnGWmGC1YnSt2s_VbZU4KX0iY7LciZ3PU,6068
  validmind/tests/prompt_validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  validmind/tests/prompt_validation/ai_powered_test.py,sha256=7TTeIR5GotQosm7oVT8Y3KnwPB3XkVT1Fzhckpr-SgE,1963
- validmind/tests/run.py,sha256=WuLV8iY2xN7bRPu5px75-rgRKeh_XYPtbdLhqG8Dugo,15874
+ validmind/tests/run.py,sha256=K_EiaquuSv7rVnr-wl2uO1HUDypWGIEXA8JIjw5xsKw,15876
  validmind/tests/test_providers.py,sha256=47xe5eb5ufvj1jmhdRsbSvDQTXSDpFDFNeXg3xtXwhw,5320
  validmind/tests/utils.py,sha256=kNrxfUYbj4DwmkZtpp_1rG4GMUGxYEhvqnYR_A7qAKM,471
  validmind/unit_metrics/__init__.py,sha256=mFk52eU7bOQKTpruKSrPyzjmxFUpIi5RZuwIE5BVFHU,7345
@@ -295,10 +294,10 @@ validmind/utils.py,sha256=DYUB3nig6MJwct5dymhy7Gt9apwzPVipKAWxSrm0-tg,15849
  validmind/vm_models/__init__.py,sha256=V5DH-E1Rkvl-HQEkilppVCHBag9MQXkzyoORLW3LSGQ,1210
  validmind/vm_models/dataset/__init__.py,sha256=U4CxZjdoc0dd9u2AqBl5PJh1UVbzXWNrmundmjLF-qE,346
  validmind/vm_models/dataset/dataset.py,sha256=hBaczQjo-Jb1u6Ma5yX86m6JzT16XndAlq32WbHqVx8,25645
- validmind/vm_models/dataset/utils.py,sha256=DRFCg93YE7sTRrWAGt1RIyvzPjINagMk6zUw7z692d0,5325
+ validmind/vm_models/dataset/utils.py,sha256=VMcPEgwW9oW5D0MCa_MqXCq_sEzzsLLRmS4RaYrsif0,5530
  validmind/vm_models/figure.py,sha256=iSrvPcCG5sQrMkX1Fh6c5utRzaroh3bc6IlnGDOK_Eg,6651
  validmind/vm_models/input.py,sha256=qLdqz_bktr4v0YcPha2vFdDvmkC-btT1pH9zBIkt1OY,1046
- validmind/vm_models/model.py,sha256=P-zKbh0TrU_4ZK-bA0l83h6K6nfU6v0lIpC4mfCl6Fw,6115
+ validmind/vm_models/model.py,sha256=Dewux_jTgUAXPgHW6ZtJTa8WvH0WkWsryO43DI9HkMU,6409
  validmind/vm_models/test/metric.py,sha256=DvXMju36JzxArXNWimq3SSrSUoIHkyvDbuhbgBOKxkk,3357
  validmind/vm_models/test/metric_result.py,sha256=Bak4GDrMlNq5NtgP5exwlPsKZgz3tWgtC6jZqtHjvqM,1987
  validmind/vm_models/test/output_template.py,sha256=njqCAMyLxwadkCWhACVskyL9-psTgmUysaeeirTVAX4,1500
@@ -312,8 +311,8 @@ validmind/vm_models/test_suite/runner.py,sha256=aewxadRfoOPH48jes2Gtb3Ju_FWFfVM_
  validmind/vm_models/test_suite/summary.py,sha256=GQRNe2ZvvqjQN0yKmaN7ohAUjRFQIN4YYUYxfOuWN6M,4682
  validmind/vm_models/test_suite/test.py,sha256=_GfbK36l98SjzgVcucmp0OKBJKqMW3neO7SqJ3EWeps,5049
  validmind/vm_models/test_suite/test_suite.py,sha256=Cns2wL54v0T5Mv5_HJb3kMeaa4rtycdqT8KxK9_rWEU,6279
- validmind-2.5.2.dist-info/LICENSE,sha256=XonPUfwjvrC5Ombl3y-ko0Wubb1xdG_7nzvIbkZRKHw,35772
- validmind-2.5.2.dist-info/METADATA,sha256=ohJLci5xsiIdqzeyGW_7WBkC1HkZPx4hoje_IscPds0,4242
- validmind-2.5.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- validmind-2.5.2.dist-info/entry_points.txt,sha256=HuW7YyOv9u_OEWpViQXtv0nfoI67uieJHawKWA4Hv9A,76
- validmind-2.5.2.dist-info/RECORD,,
+ validmind-2.5.8.dist-info/LICENSE,sha256=XonPUfwjvrC5Ombl3y-ko0Wubb1xdG_7nzvIbkZRKHw,35772
+ validmind-2.5.8.dist-info/METADATA,sha256=YrAvv1MV1wQ1q4FaqUSvJNVP3ZSC_P9AeY4GY0pFiEI,4242
+ validmind-2.5.8.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ validmind-2.5.8.dist-info/entry_points.txt,sha256=HuW7YyOv9u_OEWpViQXtv0nfoI67uieJHawKWA4Hv9A,76
+ validmind-2.5.8.dist-info/RECORD,,
@@ -1,119 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- import warnings
-
- import plotly.express as px
- from datasets import Dataset
-
- from validmind import tags, tasks
-
- from .utils import get_ragas_config, get_renamed_columns
-
-
- @tags("ragas", "llm", "retrieval_performance")
- @tasks("text_qa", "text_generation", "text_summarization", "text_classification")
- def ContextRelevancy(
- dataset,
- question_column: str = "question",
- contexts_column: str = "contexts",
- ):
- """
- Evaluates the context relevancy metric for entries in a dataset and visualizes the
- results.
-
- This metric gauges the relevancy of the retrieved context, calculated based on both
- the `question` and `contexts`. The values fall within the range of (0, 1), with
- higher values indicating better relevancy.
-
- Ideally, the retrieved context should exclusively contain essential information to
- address the provided query. To compute this, we initially estimate the value of by
- identifying sentences within the retrieved context that are relevant for answering
- the given question. The final score is determined by the following formula:
-
- $$
- \\text{context relevancy} = {|S| \\over |\\text{Total number of sentences in retrieved context}|}
- $$
-
- ### Configuring Columns
-
- This metric requires the following columns in your dataset:
- - `question` (str): The text query that was input into the model.
- - `contexts` (List[str]): A list of text contexts which are retrieved and which
- will be evaluated to make sure they are relevant to the question.
-
- If the above data is not in the appropriate column, you can specify different column
- names for these fields using the parameters `question_column` and `contexts_column`.
-
- For example, if your dataset has this data stored in different columns, you can
- pass the following parameters:
- ```python
- {
- "question_column": "question",
- "contexts_column": "context_info"
- }
- ```
-
- If the data is stored as a dictionary in another column, specify the column and key
- like this:
- ```python
- pred_col = dataset.prediction_column(model)
- params = {
- "contexts_column": f"{pred_col}.contexts",
- }
- ```
-
- For more complex situations, you can use a function to extract the data:
- ```python
- pred_col = dataset.prediction_column(model)
- params = {
- "contexts_column": lambda x: [x[pred_col]["context_message"]],
- }
- ```
- """
- try:
- from ragas import evaluate
- from ragas.metrics import context_relevancy
- except ImportError:
- raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
- warnings.filterwarnings(
- "ignore",
- category=FutureWarning,
- message="promote has been superseded by promote_options='default'.",
- )
-
- required_columns = {
- "question": question_column,
- "contexts": contexts_column,
- }
-
- df = get_renamed_columns(dataset.df, required_columns)
-
- result_df = evaluate(
- Dataset.from_pandas(df), metrics=[context_relevancy], **get_ragas_config()
- ).to_pandas()
-
- fig_histogram = px.histogram(x=result_df["context_relevancy"].to_list(), nbins=10)
- fig_box = px.box(x=result_df["context_relevancy"].to_list())
-
- return (
- {
- "Scores (will not be uploaded to UI)": result_df[
- ["question", "contexts", "context_relevancy"]
- ],
- "Aggregate Scores": [
- {
- "Mean Score": result_df["context_relevancy"].mean(),
- "Median Score": result_df["context_relevancy"].median(),
- "Max Score": result_df["context_relevancy"].max(),
- "Min Score": result_df["context_relevancy"].min(),
- "Standard Deviation": result_df["context_relevancy"].std(),
- "Count": len(result_df),
- }
- ],
- },
- fig_histogram,
- fig_box,
- )