validmind 2.0.1__py3-none-any.whl → 2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +4 -1
- validmind/__version__.py +1 -1
- validmind/ai.py +197 -0
- validmind/api_client.py +16 -4
- validmind/client.py +23 -3
- validmind/datasets/classification/customer_churn.py +2 -2
- validmind/datasets/nlp/__init__.py +5 -0
- validmind/datasets/nlp/cnn_dailymail.py +98 -0
- validmind/datasets/nlp/datasets/cnn_dailymail_100_with_predictions.csv +255 -0
- validmind/datasets/nlp/datasets/cnn_dailymail_500_with_predictions.csv +1277 -0
- validmind/datasets/nlp/datasets/sentiments_with_predictions.csv +4847 -0
- validmind/errors.py +11 -1
- validmind/models/huggingface.py +2 -2
- validmind/models/pytorch.py +3 -3
- validmind/models/sklearn.py +4 -4
- validmind/tests/__init__.py +47 -9
- validmind/tests/data_validation/DatasetDescription.py +0 -1
- validmind/tests/data_validation/nlp/StopWords.py +1 -6
- validmind/tests/data_validation/nlp/TextDescription.py +20 -9
- validmind/tests/decorator.py +189 -0
- validmind/tests/model_validation/MeteorScore.py +92 -0
- validmind/tests/model_validation/RegardHistogram.py +5 -6
- validmind/tests/model_validation/RegardScore.py +3 -5
- validmind/tests/model_validation/RougeMetrics.py +6 -4
- validmind/tests/model_validation/SelfCheckNLIScore.py +112 -0
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +17 -22
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +3 -1
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +30 -4
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +9 -3
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
- validmind/tests/prompt_validation/ai_powered_test.py +2 -0
- validmind/unit_metrics/__init__.py +0 -2
- validmind/unit_metrics/composite.py +275 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +39 -0
- validmind/unit_metrics/regression/HuberLoss.py +27 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +36 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +22 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +22 -0
- validmind/unit_metrics/regression/QuantileLoss.py +25 -0
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +27 -0
- validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py +22 -0
- validmind/unit_metrics/regression/sklearn/MeanSquaredError.py +22 -0
- validmind/unit_metrics/regression/sklearn/RSquaredScore.py +22 -0
- validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py +23 -0
- validmind/unit_metrics/sklearn/classification/Accuracy.py +2 -0
- validmind/unit_metrics/sklearn/classification/F1.py +2 -0
- validmind/unit_metrics/sklearn/classification/Precision.py +2 -0
- validmind/unit_metrics/sklearn/classification/ROC_AUC.py +2 -0
- validmind/unit_metrics/sklearn/classification/Recall.py +2 -0
- validmind/utils.py +17 -1
- validmind/vm_models/dataset.py +376 -21
- validmind/vm_models/figure.py +52 -17
- validmind/vm_models/test/metric.py +33 -30
- validmind/vm_models/test/output_template.py +0 -27
- validmind/vm_models/test/result_wrapper.py +57 -24
- validmind/vm_models/test/test.py +2 -1
- validmind/vm_models/test/threshold_test.py +24 -13
- validmind/vm_models/test_context.py +7 -0
- validmind/vm_models/test_suite/runner.py +1 -1
- validmind/vm_models/test_suite/test.py +1 -1
- {validmind-2.0.1.dist-info → validmind-2.0.7.dist-info}/METADATA +9 -13
- {validmind-2.0.1.dist-info → validmind-2.0.7.dist-info}/RECORD +65 -44
- validmind-2.0.7.dist-info/entry_points.txt +3 -0
- {validmind-2.0.1.dist-info → validmind-2.0.7.dist-info}/LICENSE +0 -0
- {validmind-2.0.1.dist-info → validmind-2.0.7.dist-info}/WHEEL +0 -0
validmind/errors.py
CHANGED
@@ -48,7 +48,7 @@ class MissingCacheResultsArgumentsError(BaseError):
     pass
 
 
-class
+class MissingOrInvalidModelPredictFnError(BaseError):
     """
     When the pytorch model is missing a predict function or its predict
     method does not have the expected arguments.
@@ -315,6 +315,14 @@ class UnsupportedModelError(BaseError):
     pass
 
 
+class UnsupportedModelForSHAPError(BaseError):
+    """
+    When an unsupported model is used for SHAP importance.
+    """
+
+    pass
+
+
 class SkipTestError(BaseError):
     """
     Useful error to throw when a test cannot be executed.
@@ -361,6 +369,8 @@ def should_raise_on_fail_fast(error) -> bool:
     """
     error_class = error.__class__.__name__
    return error_class not in [
+        "MissingOrInvalidModelPredictFnError",
         "MissingRequiredTestInputError",
         "SkipTestError",
+        "UnsupportedModelForSHAPError",
     ]
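Both new entries in `should_raise_on_fail_fast` mean a fail-fast run now logs and moves past these errors instead of aborting. A minimal sketch of how calling code might treat the new SHAP error; the try/except wrapper below is illustrative and not part of the package:

    # Hedged sketch: reacting to the UnsupportedModelForSHAPError added in 2.0.7.
    from validmind.errors import UnsupportedModelForSHAPError

    def run_shap_or_skip(run_shap_fn, model):
        """Run a SHAP-based check, treating unsupported models as a soft skip."""
        try:
            return run_shap_fn(model)
        except UnsupportedModelForSHAPError as exc:
            # Mirrors should_raise_on_fail_fast(): this error class is excluded
            # from the fail-fast list, so logging and continuing is the intent.
            print(f"Skipping SHAP importance: {exc}")
            return None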
validmind/models/huggingface.py
CHANGED
@@ -6,7 +6,7 @@ from dataclasses import dataclass
 
 import pandas as pd
 
-from validmind.errors import
+from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
 from validmind.vm_models.model import (
     ModelAttributes,
@@ -44,7 +44,7 @@ class HFModel(VMModel):
         Invoke predict_proba from underline model
         """
         if not has_method_with_arguments(self.model, "predict_proba", 1):
-            raise
+            raise MissingOrInvalidModelPredictFnError(
                 "Model requires a implementation of predict_proba method with 1 argument"
                 + " that is tensor features matrix"
             )
validmind/models/pytorch.py
CHANGED
@@ -2,7 +2,7 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from validmind.errors import
+from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
 from validmind.vm_models.model import (
     ModelAttributes,
@@ -41,7 +41,7 @@ class PyTorchModel(VMModel):
         Invoke predict_proba from underline model
         """
         if not has_method_with_arguments(self.model, "predict_proba", 1):
-            raise
+            raise MissingOrInvalidModelPredictFnError(
                 "Model requires a implemention of predict_proba method with 1 argument"
                 + " that is tensor features matrix"
             )
@@ -54,7 +54,7 @@ class PyTorchModel(VMModel):
         Predict method for the model. This is a wrapper around the model's
         """
         if not has_method_with_arguments(self.model, "predict", 1):
-            raise
+            raise MissingOrInvalidModelPredictFnError(
                 "Model requires a implemention of predict method with 1 argument"
                 + " that is tensor features matrix"
             )
validmind/models/sklearn.py
CHANGED
@@ -2,7 +2,7 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from validmind.errors import
+from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
 from validmind.vm_models.model import (
     ModelAttributes,
@@ -40,9 +40,9 @@ class SKlearnModel(VMModel):
         predict_proba (for classification) or predict (for regression) method
         """
         if not has_method_with_arguments(self.model, "predict_proba", 1):
-            raise
-                "
-                + "
+            raise MissingOrInvalidModelPredictFnError(
+                f"SKlearn model {self.model.__class__} Model does not have a compatible predict_proba implementation."
+                + " Please assign predictions directly with vm_dataset.assign_predictions(model, prediction_values)"
             )
         if callable(getattr(self.model, "predict_proba", None)):
             return self.model.predict_proba(*args, **kwargs)[:, 1]
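The new sklearn error message points users at assigning predictions directly on the dataset when the wrapped model has no compatible `predict_proba`. A hedged sketch of that workflow; the `init_model`/`init_dataset` setup and column names are assumptions, and only `assign_predictions(model, prediction_values)` comes from the message itself:

    # Illustrative only: exact keyword arguments may differ between releases.
    import validmind as vm

    vm_model = vm.init_model(model, input_id="my_model")
    vm_ds = vm.init_dataset(dataset=df, input_id="test_ds", target_column="target")

    # Compute predictions yourself and attach them, as the error message suggests:
    prediction_values = model.predict(df.drop(columns=["target"]))
    vm_ds.assign_predictions(vm_model, prediction_values)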
validmind/tests/__init__.py
CHANGED
@@ -18,6 +18,7 @@ from markdown import markdown
 from ..errors import LoadTestError
 from ..html_templates.content_blocks import test_content_block_html
 from ..logging import get_logger
+from ..unit_metrics.composite import load_composite_metric
 from ..utils import clean_docstring, format_dataframe, fuzzy_match, test_id_to_name
 from ..vm_models import TestContext, TestInput
 from .__types__ import ExternalTestProvider
@@ -43,6 +44,7 @@ __tests = None
 __test_classes = None
 
 __test_providers: Dict[str, ExternalTestProvider] = {}
+__custom_tests: Dict[str, object] = {}
 
 
 def _test_description(test_class, truncate=True):
@@ -260,13 +262,13 @@ def load_test(test_id, reload=False):  # noqa: C901
     error = None
     namespace = parts[0]
 
-    if
-
-            f"Unable to load test {test_id}. "
-            f"No Test Provider found for the namespace: {namespace}."
-        )
+    if test_id.split(":")[0] in __custom_tests:
+        test = __custom_tests[test_id.split(":")[0]]
 
-
+    elif test_id.startswith("validmind.composite_metric"):
+        test = load_composite_metric(test_id)
+
+    elif namespace == "validmind":
         test_module = ".".join(parts[1:-1])
         test_class = parts[-1]
 
@@ -284,6 +286,12 @@ def load_test(test_id, reload=False):  # noqa: C901
         except AttributeError:
             error = f"Unable to load test {test_id}. Class not in module: {test_class}"
 
+    elif namespace != "validmind" and namespace not in __test_providers:
+        error = (
+            f"Unable to load test {test_id}. "
+            f"No Test Provider found for the namespace: {namespace}."
+        )
+
     elif namespace in __test_providers:
         try:
             test = __test_providers[namespace].load_test(test_id.split(".", 1)[1])
@@ -346,11 +354,24 @@ def describe_test(test_id: str = None, raw: bool = False):
     )
 
 
-def run_test(
+def run_test(
+    test_id: str = None,
+    name: str = None,
+    unit_metrics: list = None,
+    params: dict = None,
+    inputs=None,
+    output_template=None,
+    **kwargs,
+):
     """Run a test by test ID
 
     Args:
-        test_id (str): The test ID
+        test_id (str, option): The test ID to run - required when running a single test
+            i.e. when not running multiple unit metrics
+        name (str, optional): The name of the test (used to create a composite metric
+            out of multiple unit metrics) - required when running multiple unit metrics
+        unit_metrics (list, optional): A list of unit metric IDs to run as a composite
+            metric - required when running multiple unit metrics
         params (dict, optional): A dictionary of params to override the default params
         inputs: A dictionary of test inputs to pass to the Test
         output_template (str, optional): A template to use for customizing the output
@@ -360,7 +381,20 @@ def run_test(test_id, params: dict = None, inputs=None, output_template=None, **
         - models: A list of models to use for the test
         other inputs can be accessed inside the test via `self.inputs["input_name"]`
     """
-
+    if not test_id and not name and not unit_metrics:
+        raise ValueError(
+            "`test_id` or `name` and `unit_metrics` must be provided to run a test"
+        )
+
+    if (unit_metrics and not name) or (name and not unit_metrics):
+        raise ValueError("`name` and `unit_metrics` must be provided together")
+
+    if unit_metrics:
+        TestClass = load_composite_metric(unit_metrics=unit_metrics, metric_name=name)
+        test_id = f"validmind.composite_metric.{name}"
+    else:
+        TestClass = load_test(test_id, reload=True)
+
     test = TestClass(
         test_id=test_id,
         context=TestContext(),
@@ -383,3 +417,7 @@ def register_test_provider(namespace: str, test_provider: ExternalTestProvider)
         test_provider (ExternalTestProvider): The test provider
     """
     __test_providers[namespace] = test_provider
+
+
+def _register_custom_test(test_id: str, test_class: object):
+    __custom_tests[test_id] = test_class
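Together, these `run_test` changes allow building a composite metric from individual unit metrics. A hedged usage sketch based on the new signature; the metric IDs are mapped from the file paths in the summary list above, while the composite name, the `vm_model`/`vm_dataset` inputs, and the `result.log()` call are assumed setup from prior releases, not shown in this diff:

    from validmind.tests import run_test

    result = run_test(
        name="RegressionErrors",  # hypothetical composite-metric name
        unit_metrics=[
            "validmind.unit_metrics.regression.sklearn.MeanAbsoluteError",
            "validmind.unit_metrics.regression.sklearn.MeanSquaredError",
        ],
        inputs={"model": vm_model, "dataset": vm_dataset},
    )
    result.log()  # assumed: result wrappers expose log() as in earlier versions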
validmind/tests/data_validation/nlp/StopWords.py
CHANGED
@@ -22,7 +22,6 @@ from validmind.vm_models import (
     ResultTableMetadata,
     ThresholdTest,
     ThresholdTestResult,
-    VMDataset,
 )
 
 
@@ -86,17 +85,13 @@ class StopWords(ThresholdTest):
                 ResultTable(
                     data=df,
                     metadata=ResultTableMetadata(
-                        title=f"
+                        title=f"Stop words results for column '{self.inputs.dataset.target_column}'"
                     ),
                 )
             ]
         )
 
     def run(self):
-        # Can only run this test if we have a Dataset object
-        if not isinstance(self.inputs.dataset, VMDataset):
-            raise ValueError("ClassImbalance requires a validmind Dataset object")
-
         text_column = self.inputs.dataset.text_column
 
         def create_corpus(df, text_column):
validmind/tests/data_validation/nlp/TextDescription.py
CHANGED
@@ -92,9 +92,12 @@ class TextDescription(Metric):
         total_words = len(words)
         total_sentences = len(sentences)
         avg_sentence_length = round(
-
-
-
+            (
+                sum(len(sentence.split()) for sentence in sentences)
+                / total_sentences
+                if total_sentences
+                else 0
+            ),
             1,
         )
         total_paragraphs = len(paragraphs)
@@ -161,9 +164,13 @@ class TextDescription(Metric):
         return combined_df
 
     def run(self):
+        # Enforce that text_column must be provided as part of the params
+        if self.inputs.dataset.text_column is None:
+            raise ValueError("A 'text_column' must be provided to run this test.")
+
         # Can only run this test if we have a Dataset object
         if not isinstance(self.inputs.dataset, VMDataset):
-            raise ValueError("
+            raise ValueError("TextDescription requires a validmind Dataset object")
 
         df_text_description = self.text_description_table(
             self.inputs.dataset.df, self.params
@@ -177,27 +184,31 @@ class TextDescription(Metric):
             ("Total Unique Words", "Lexical Diversity"),
         ]
         params = {"combinations_to_plot": combinations_to_plot}
-        figures = self.
+        figures = self.text_description_plots(df_text_description, params)
 
         return self.cache_results(
             figures=figures,
         )
 
     # Function to plot scatter plots for specified combinations using Plotly
-    def
+    def text_description_plots(self, df, params):
         combinations_to_plot = params["combinations_to_plot"]
         figures = []
         # Create hist plots for each column
         for i, column in enumerate(df.columns):
             fig = px.histogram(df, x=column)
             fig.update_layout(bargap=0.2)
-
+            # Generate a unique key for each histogram using the column name and index
+            histogram_key = f"{self.name}_histogram_{column}_{i}"
+            figures.append(Figure(for_object=self, key=histogram_key, figure=fig))
 
-        for metric1, metric2 in combinations_to_plot:
+        for j, (metric1, metric2) in enumerate(combinations_to_plot):
             fig = px.scatter(
                 df, x=metric1, y=metric2, title=f"Scatter Plot: {metric1} vs {metric2}"
             )
-
+            # Generate a unique key for each scatter plot using the metric names and index
+            scatter_key = f"{self.name}_scatter_{metric1}_vs_{metric2}_{j}"
+            figures.append(Figure(for_object=self, key=scatter_key, figure=fig))
         plt.close("all")
 
         return figures
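Because `TextDescription.run` now fails fast when `dataset.text_column` is unset, the text column has to be declared when the dataset is initialized. A hedged reminder sketch; the `init_dataset` keyword arguments and test ID shown here are assumptions based on the file path, not taken verbatim from the diff:

    import validmind as vm

    vm_ds = vm.init_dataset(dataset=df, input_id="nlp_ds", text_column="text")
    result = vm.tests.run_test(
        "validmind.data_validation.nlp.TextDescription",
        inputs={"dataset": vm_ds},
    )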
validmind/tests/decorator.py
ADDED
@@ -0,0 +1,189 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+"""Decorators for creating and registering metrics with the ValidMind framework."""
+
+import inspect
+from uuid import uuid4
+
+import pandas as pd
+
+from validmind.logging import get_logger
+from validmind.utils import clean_docstring
+from validmind.vm_models import (
+    Metric,
+    MetricResult,
+    ResultSummary,
+    ResultTable,
+    ResultTableMetadata,
+)
+from validmind.vm_models.figure import (
+    Figure,
+    is_matplotlib_figure,
+    is_plotly_figure,
+    is_png_image,
+)
+from validmind.vm_models.test.result_wrapper import MetricResultWrapper
+
+from . import _register_custom_test
+
+logger = get_logger(__name__)
+
+
+def _inspect_signature(test_func: callable):
+    input_keys = ["dataset", "datasets", "model", "models"]
+
+    inputs = {}
+    params = {}
+
+    for name, arg in inspect.signature(test_func).parameters.items():
+        if name in input_keys:
+            target_dict = inputs
+        else:
+            target_dict = params
+
+        target_dict[name] = {
+            "type": arg.annotation,
+            "default": (
+                arg.default if arg.default is not inspect.Parameter.empty else None
+            ),
+        }
+
+    return inputs, params
+
+
+def _build_result(results, test_id, description, output_template):
+    ref_id = str(uuid4())
+    figure_metadata = {
+        "_type": "metric",
+        "_name": test_id,
+        "_ref_id": ref_id,
+    }
+
+    tables = []
+    figures = []
+
+    def process_item(item):
+        if is_matplotlib_figure(item) or is_plotly_figure(item) or is_png_image(item):
+            figures.append(
+                Figure(
+                    key=f"{test_id}:{len(figures) + 1}",
+                    figure=item,
+                    metadata=figure_metadata,
+                )
+            )
+        elif isinstance(item, list):
+            tables.append(ResultTable(data=item))
+        elif isinstance(item, pd.DataFrame):
+            tables.append(ResultTable(data=item))
+        elif isinstance(item, dict):
+            for table_name, table in item.items():
+                tables.append(
+                    ResultTable(
+                        data=table,
+                        metadata=ResultTableMetadata(title=table_name),
+                    )
+                )
+        else:
+            raise ValueError(f"Invalid return type: {type(item)}")
+
+    # if the results are a tuple, process each item as a separate result
+    if isinstance(results, tuple):
+        for item in results:
+            process_item(item)
+    else:
+        process_item(results)
+
+    return MetricResultWrapper(
+        result_id=test_id,
+        metric=MetricResult(
+            key=test_id,
+            ref_id=ref_id,
+            value="Empty",
+            summary=ResultSummary(results=tables),
+        ),
+        figures=figures,
+        result_metadata=[
+            {
+                "content_id": f"metric_description:{test_id}",
+                "text": clean_docstring(description),
+            }
+        ],
+        inputs=[],
+        output_template=output_template,
+    )
+
+
+def get_run_method(func, inputs, params):
+    def run(self: Metric):
+        input_kwargs = {k: getattr(self.inputs, k) for k in inputs.keys()}
+        param_kwargs = {
+            k: self.params.get(k, params[k]["default"]) for k in params.keys()
+        }
+
+        raw_results = func(**input_kwargs, **param_kwargs)
+
+        self.result = _build_result(
+            results=raw_results,
+            test_id=self.test_id,
+            description=self.__doc__,
+            output_template=self.output_template,
+        )
+
+        return self.result
+
+    return run
+
+
+def metric(func_or_id):
+    """Decorator for creating and registering metrics with the ValidMind framework.
+
+    Creates a metric object and registers it with ValidMind under the provided ID. If
+    no ID is provided, the function name will be used as to build one. So if the
+    function name is `my_metric`, the metric will be registered under the ID
+    `validmind.custom_metrics.my_metric`.
+
+    This decorator works by creating a new `Metric` class will be created whose `run`
+    method calls the decorated function. This function should take as arguments the
+    inputs it requires (`dataset`, `datasets`, `model`, `models`) followed by any
+    parameters. It can return any number of the following types:
+
+    - Table: Either a list of dictionaries or a pandas DataFrame
+    - Plot: Either a matplotlib figure or a plotly figure
+
+    The function may also include a docstring. This docstring will be used and logged
+    as the metric's description.
+
+    Args:
+        func: The function to decorate
+        test_id: The identifier for the metric. If not provided, the function name is used.
+
+    Returns:
+        The decorated function.
+    """
+
+    def decorator(func):
+        test_id = func_or_id or f"validmind.custom_metrics.{func.__name__}"
+
+        inputs, params = _inspect_signature(func)
+        description = inspect.getdoc(func)
+
+        metric_class = type(
+            func.__name__,
+            (Metric,),
+            {
+                "run": get_run_method(func, inputs, params),
+                "required_inputs": list(inputs.keys()),
+                "default_parameters": params,
+                "__doc__": description,
+            },
+        )
+        _register_custom_test(test_id, metric_class)
+
+        return func
+
+    if callable(func_or_id):
+        return decorator(func_or_id)
+
+    return decorator
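Since `decorator.py` is new in this release, a short usage sketch may help. It follows the decorator's own docstring above; the example metric, its dataset columns, and the follow-up `run_test` call are illustrative assumptions, not part of the package:

    import pandas as pd

    from validmind.tests.decorator import metric


    @metric("validmind.custom_metrics.missing_values")
    def missing_values(dataset, top_n: int = 10):
        """Counts missing values per column of the dataset (illustrative custom metric)."""
        counts = dataset.df.isna().sum().sort_values(ascending=False).head(top_n)
        # Returning a DataFrame is turned into a ResultTable by _build_result above
        return pd.DataFrame({"column": counts.index, "missing": counts.values})

Per the `load_test` changes earlier in this diff, the registered ID can then be executed with `run_test("validmind.custom_metrics.missing_values", inputs={"dataset": vm_dataset})`, assuming an initialized `vm_dataset` input.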
validmind/tests/model_validation/MeteorScore.py
ADDED
@@ -0,0 +1,92 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from dataclasses import dataclass
+
+import evaluate
+import pandas as pd
+import plotly.graph_objects as go
+
+from validmind.vm_models import Figure, Metric
+
+
+@dataclass
+class MeteorScore(Metric):
+    """
+    Computes and visualizes the METEOR score for each text generation instance, assessing translation quality.
+
+    **Purpose**: METEOR (Metric for Evaluation of Translation with Explicit ORdering) is designed to evaluate the quality
+    of machine translations by comparing them against reference translations. It emphasizes both the accuracy and fluency
+    of translations, incorporating precision, recall, and word order into its assessment.
+
+    **Test Mechanism**: The METEOR score is computed for each pair of machine-generated translation (prediction) and its
+    corresponding human-produced reference. This is done by considering unigram matches between the translations, including
+    matches based on surface forms, stemmed forms, and synonyms. The score is a combination of unigram precision and recall,
+    adjusted for word order through a fragmentation penalty.
+
+    **Signs of High Risk**:
+    - Lower METEOR scores can indicate a lack of alignment between the machine-generated translations and their human-produced references, highlighting potential deficiencies in both the accuracy and fluency of translations.
+    - Significant discrepancies in word order or an excessive fragmentation penalty could signal issues with how the translation model processes and reconstructs sentence structures, potentially compromising the natural flow of translated text.
+    - Persistent underperformance across a variety of text types or linguistic contexts might suggest a broader inability of the model to adapt to the nuances of different languages or dialects, pointing towards gaps in its training or inherent limitations.
+
+    **Strengths**:
+    - Incorporates a balanced consideration of precision and recall, weighted towards recall to reflect the importance of
+    content coverage in translations.
+    - Directly accounts for word order, offering a nuanced evaluation of translation fluency beyond simple lexical matching.
+    - Adapts to various forms of lexical similarity, including synonyms and stemmed forms, allowing for flexible matching.
+
+    **Limitations**:
+    - While comprehensive, the complexity of METEOR's calculation can make it computationally intensive, especially for
+    large datasets.
+    - The use of external resources for synonym and stemming matching may introduce variability based on the resources'
+    quality and relevance to the specific translation task.
+    """
+
+    name = "meteor_score"
+    required_inputs = ["model", "dataset"]
+
+    def run(self):
+        # Load the METEOR metric
+        meteor = evaluate.load("meteor")
+
+        # Initialize a list to hold METEOR scores
+        meteor_scores = []
+
+        for prediction, reference in zip(
+            self.inputs.dataset.y_pred(self.inputs.model.input_id),
+            self.inputs.dataset.y,
+        ):
+            # Compute the METEOR score for the current prediction-reference pair
+            result = meteor.compute(predictions=[prediction], references=[reference])
+            meteor_scores.append(result["meteor"])
+
+        # Visualization of METEOR scores
+        figures = self.visualize_scores(meteor_scores)
+
+        return self.cache_results(figures=figures)
+
+    def visualize_scores(self, scores):
+        # Convert the scores list to a DataFrame for plotting
+        scores_df = pd.DataFrame(scores, columns=["METEOR Score"])
+
+        # Create a line plot of the METEOR scores
+        fig = go.Figure()
+        fig.add_trace(
+            go.Scatter(
+                x=scores_df.index,
+                y=scores_df["METEOR Score"],
+                mode="lines+markers",
+                name="METEOR Score",
+            )
+        )
+        fig.update_layout(
+            title="METEOR Scores Across Text Instances",
+            xaxis_title="Text Instance Index",
+            yaxis_title="METEOR Score",
+        )
+
+        # Wrap the Plotly figure for compatibility with your framework
+        figures = [Figure(for_object=self, key=self.key, figure=fig)]
+
+        return figures
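A hedged sketch of invoking the new MeteorScore test; the test ID follows the file path above, while the `vm_model`/`vm_test_ds` inputs (with predictions already assigned, since the test reads `dataset.y_pred(model.input_id)` and `dataset.y`) and the `result.log()` call are assumed setup, not part of this diff:

    import validmind as vm

    result = vm.tests.run_test(
        "validmind.model_validation.MeteorScore",
        inputs={"model": vm_model, "dataset": vm_test_ds},
    )
    result.log()  # assumed: logs the result to the ValidMind platform as in prior releases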
validmind/tests/model_validation/RegardHistogram.py
CHANGED
@@ -58,21 +58,19 @@ class RegardHistogram(Metric):
 
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
         y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
-        input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
 
-        if not len(y_true) == len(y_pred)
+        if not len(y_true) == len(y_pred):
             raise ValueError(
-                "Inconsistent lengths among
+                "Inconsistent lengths among true summaries and predicted summaries."
             )
 
-        return
+        return y_true, y_pred
 
     def regard_histogram(self):
         regard_tool = evaluate.load("regard")
-
+        y_true, y_pred = self._get_datasets()
 
         dataframes = {
-            "Input Text": input_text,
             "Target Text": y_true,
             "Predicted Summaries": y_pred,
         }
@@ -101,6 +99,7 @@ class RegardHistogram(Metric):
         )
 
         row_offset = 0
+
         for column_name, column_data in dataframes.items():
             results = regard_tool.compute(data=column_data)["regard"]
             regard_dicts = [
validmind/tests/model_validation/RegardScore.py
CHANGED
@@ -59,21 +59,19 @@ class RegardScore(Metric):
 
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
         y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
-        input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
 
-        if not len(y_true) == len(y_pred)
+        if not len(y_true) == len(y_pred):
             raise ValueError(
                 "Inconsistent lengths among input text, true summaries, and predicted summaries."
             )
 
-        return
+        return y_true, y_pred
 
     def regard_line_plot(self):
         regard_tool = evaluate.load("regard")
-
+        y_true, y_pred = self._get_datasets()
 
         dataframes = {
-            "Input Text": input_text,
             "Target Text": y_true,
             "Predicted Summaries": y_pred,
         }
|