validmind 2.0.1__py3-none-any.whl → 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. validmind/__init__.py +4 -1
  2. validmind/__version__.py +1 -1
  3. validmind/ai.py +197 -0
  4. validmind/api_client.py +16 -4
  5. validmind/client.py +23 -3
  6. validmind/datasets/classification/customer_churn.py +2 -2
  7. validmind/datasets/nlp/__init__.py +5 -0
  8. validmind/datasets/nlp/cnn_dailymail.py +98 -0
  9. validmind/datasets/nlp/datasets/cnn_dailymail_100_with_predictions.csv +255 -0
  10. validmind/datasets/nlp/datasets/cnn_dailymail_500_with_predictions.csv +1277 -0
  11. validmind/datasets/nlp/datasets/sentiments_with_predictions.csv +4847 -0
  12. validmind/errors.py +11 -1
  13. validmind/models/huggingface.py +2 -2
  14. validmind/models/pytorch.py +3 -3
  15. validmind/models/sklearn.py +4 -4
  16. validmind/tests/__init__.py +47 -9
  17. validmind/tests/data_validation/DatasetDescription.py +0 -1
  18. validmind/tests/data_validation/nlp/StopWords.py +1 -6
  19. validmind/tests/data_validation/nlp/TextDescription.py +20 -9
  20. validmind/tests/decorator.py +189 -0
  21. validmind/tests/model_validation/MeteorScore.py +92 -0
  22. validmind/tests/model_validation/RegardHistogram.py +5 -6
  23. validmind/tests/model_validation/RegardScore.py +3 -5
  24. validmind/tests/model_validation/RougeMetrics.py +6 -4
  25. validmind/tests/model_validation/SelfCheckNLIScore.py +112 -0
  26. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +17 -22
  27. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +3 -1
  28. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +30 -4
  29. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +9 -3
  30. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
  31. validmind/tests/prompt_validation/ai_powered_test.py +2 -0
  32. validmind/unit_metrics/__init__.py +0 -2
  33. validmind/unit_metrics/composite.py +275 -0
  34. validmind/unit_metrics/regression/GiniCoefficient.py +39 -0
  35. validmind/unit_metrics/regression/HuberLoss.py +27 -0
  36. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +36 -0
  37. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +22 -0
  38. validmind/unit_metrics/regression/MeanBiasDeviation.py +22 -0
  39. validmind/unit_metrics/regression/QuantileLoss.py +25 -0
  40. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +27 -0
  41. validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py +22 -0
  42. validmind/unit_metrics/regression/sklearn/MeanSquaredError.py +22 -0
  43. validmind/unit_metrics/regression/sklearn/RSquaredScore.py +22 -0
  44. validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py +23 -0
  45. validmind/unit_metrics/sklearn/classification/Accuracy.py +2 -0
  46. validmind/unit_metrics/sklearn/classification/F1.py +2 -0
  47. validmind/unit_metrics/sklearn/classification/Precision.py +2 -0
  48. validmind/unit_metrics/sklearn/classification/ROC_AUC.py +2 -0
  49. validmind/unit_metrics/sklearn/classification/Recall.py +2 -0
  50. validmind/utils.py +17 -1
  51. validmind/vm_models/dataset.py +376 -21
  52. validmind/vm_models/figure.py +52 -17
  53. validmind/vm_models/test/metric.py +33 -30
  54. validmind/vm_models/test/output_template.py +0 -27
  55. validmind/vm_models/test/result_wrapper.py +57 -24
  56. validmind/vm_models/test/test.py +2 -1
  57. validmind/vm_models/test/threshold_test.py +24 -13
  58. validmind/vm_models/test_context.py +7 -0
  59. validmind/vm_models/test_suite/runner.py +1 -1
  60. validmind/vm_models/test_suite/test.py +1 -1
  61. {validmind-2.0.1.dist-info → validmind-2.0.7.dist-info}/METADATA +9 -13
  62. {validmind-2.0.1.dist-info → validmind-2.0.7.dist-info}/RECORD +65 -44
  63. validmind-2.0.7.dist-info/entry_points.txt +3 -0
  64. {validmind-2.0.1.dist-info → validmind-2.0.7.dist-info}/LICENSE +0 -0
  65. {validmind-2.0.1.dist-info → validmind-2.0.7.dist-info}/WHEEL +0 -0
validmind/errors.py CHANGED
@@ -48,7 +48,7 @@ class MissingCacheResultsArgumentsError(BaseError):
     pass
 
 
-class MissingModelPredictFnError(BaseError):
+class MissingOrInvalidModelPredictFnError(BaseError):
     """
     When the pytorch model is missing a predict function or its predict
     method does not have the expected arguments.
@@ -315,6 +315,14 @@ class UnsupportedModelError(BaseError):
     pass
 
 
+class UnsupportedModelForSHAPError(BaseError):
+    """
+    When an unsupported model is used for SHAP importance.
+    """
+
+    pass
+
+
 class SkipTestError(BaseError):
     """
     Useful error to throw when a test cannot be executed.
@@ -361,6 +369,8 @@ def should_raise_on_fail_fast(error) -> bool:
     """
     error_class = error.__class__.__name__
     return error_class not in [
+        "MissingOrInvalidModelPredictFnError",
         "MissingRequiredTestInputError",
         "SkipTestError",
+        "UnsupportedModelForSHAPError",
     ]
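With these changes, the renamed MissingOrInvalidModelPredictFnError and the new UnsupportedModelForSHAPError are both excluded from fail-fast behavior. A rough illustration of the intended effect (hypothetical usage, not part of the diff):

from validmind.errors import UnsupportedModelForSHAPError, should_raise_on_fail_fast

# Error classes in the exclusion list above do not abort a fail-fast test run.
assert should_raise_on_fail_fast(UnsupportedModelForSHAPError("model not supported")) is False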
validmind/models/huggingface.py CHANGED
@@ -6,7 +6,7 @@ from dataclasses import dataclass
 
 import pandas as pd
 
-from validmind.errors import MissingModelPredictFnError
+from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
 from validmind.vm_models.model import (
     ModelAttributes,
@@ -44,7 +44,7 @@ class HFModel(VMModel):
         Invoke predict_proba from underline model
         """
         if not has_method_with_arguments(self.model, "predict_proba", 1):
-            raise MissingModelPredictFnError(
+            raise MissingOrInvalidModelPredictFnError(
                 "Model requires a implementation of predict_proba method with 1 argument"
                 + " that is tensor features matrix"
             )
validmind/models/pytorch.py CHANGED
@@ -2,7 +2,7 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from validmind.errors import MissingModelPredictFnError
+from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
 from validmind.vm_models.model import (
     ModelAttributes,
@@ -41,7 +41,7 @@ class PyTorchModel(VMModel):
         Invoke predict_proba from underline model
         """
         if not has_method_with_arguments(self.model, "predict_proba", 1):
-            raise MissingModelPredictFnError(
+            raise MissingOrInvalidModelPredictFnError(
                 "Model requires a implemention of predict_proba method with 1 argument"
                 + " that is tensor features matrix"
             )
@@ -54,7 +54,7 @@ class PyTorchModel(VMModel):
         Predict method for the model. This is a wrapper around the model's
         """
         if not has_method_with_arguments(self.model, "predict", 1):
-            raise MissingModelPredictFnError(
+            raise MissingOrInvalidModelPredictFnError(
                 "Model requires a implemention of predict method with 1 argument"
                 + " that is tensor features matrix"
             )
validmind/models/sklearn.py CHANGED
@@ -2,7 +2,7 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from validmind.errors import MissingModelPredictFnError
+from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
 from validmind.vm_models.model import (
     ModelAttributes,
@@ -40,9 +40,9 @@ class SKlearnModel(VMModel):
         predict_proba (for classification) or predict (for regression) method
         """
         if not has_method_with_arguments(self.model, "predict_proba", 1):
-            raise MissingModelPredictFnError(
-                "Model requires a implemention of predict_proba method with 1 argument"
-                + " that is features matrix"
+            raise MissingOrInvalidModelPredictFnError(
+                f"SKlearn model {self.model.__class__} Model does not have a compatible predict_proba implementation."
+                + " Please assign predictions directly with vm_dataset.assign_predictions(model, prediction_values)"
             )
         if callable(getattr(self.model, "predict_proba", None)):
             return self.model.predict_proba(*args, **kwargs)[:, 1]
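The new SKlearn error message points users toward attaching precomputed predictions instead of relying on predict_proba. A minimal sketch of that workflow, assuming vm_model and vm_dataset were created via vm.init_model()/vm.init_dataset() (variable names here are illustrative):

# Compute predictions outside ValidMind, then attach them to the dataset for this model.
prediction_values = sklearn_model.predict_proba(X_test)[:, 1]  # positive-class probabilities
vm_dataset.assign_predictions(vm_model, prediction_values)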
validmind/tests/__init__.py CHANGED
@@ -18,6 +18,7 @@ from markdown import markdown
 from ..errors import LoadTestError
 from ..html_templates.content_blocks import test_content_block_html
 from ..logging import get_logger
+from ..unit_metrics.composite import load_composite_metric
 from ..utils import clean_docstring, format_dataframe, fuzzy_match, test_id_to_name
 from ..vm_models import TestContext, TestInput
 from .__types__ import ExternalTestProvider
@@ -43,6 +44,7 @@ __tests = None
 __test_classes = None
 
 __test_providers: Dict[str, ExternalTestProvider] = {}
+__custom_tests: Dict[str, object] = {}
 
 
 def _test_description(test_class, truncate=True):
@@ -260,13 +262,13 @@ def load_test(test_id, reload=False): # noqa: C901
     error = None
     namespace = parts[0]
 
-    if namespace != "validmind" and namespace not in __test_providers:
-        error = (
-            f"Unable to load test {test_id}. "
-            f"No Test Provider found for the namespace: {namespace}."
-        )
+    if test_id.split(":")[0] in __custom_tests:
+        test = __custom_tests[test_id.split(":")[0]]
 
-    if namespace == "validmind":
+    elif test_id.startswith("validmind.composite_metric"):
+        test = load_composite_metric(test_id)
+
+    elif namespace == "validmind":
         test_module = ".".join(parts[1:-1])
         test_class = parts[-1]
 
@@ -284,6 +286,12 @@ def load_test(test_id, reload=False): # noqa: C901
         except AttributeError:
             error = f"Unable to load test {test_id}. Class not in module: {test_class}"
 
+    elif namespace != "validmind" and namespace not in __test_providers:
+        error = (
+            f"Unable to load test {test_id}. "
+            f"No Test Provider found for the namespace: {namespace}."
+        )
+
     elif namespace in __test_providers:
         try:
             test = __test_providers[namespace].load_test(test_id.split(".", 1)[1])
@@ -346,11 +354,24 @@ def describe_test(test_id: str = None, raw: bool = False):
     )
 
 
-def run_test(test_id, params: dict = None, inputs=None, output_template=None, **kwargs):
+def run_test(
+    test_id: str = None,
+    name: str = None,
+    unit_metrics: list = None,
+    params: dict = None,
+    inputs=None,
+    output_template=None,
+    **kwargs,
+):
     """Run a test by test ID
 
     Args:
-        test_id (str): The test ID
+        test_id (str, option): The test ID to run - required when running a single test
+            i.e. when not running multiple unit metrics
+        name (str, optional): The name of the test (used to create a composite metric
+            out of multiple unit metrics) - required when running multiple unit metrics
+        unit_metrics (list, optional): A list of unit metric IDs to run as a composite
+            metric - required when running multiple unit metrics
        params (dict, optional): A dictionary of params to override the default params
        inputs: A dictionary of test inputs to pass to the Test
        output_template (str, optional): A template to use for customizing the output
@@ -360,7 +381,20 @@ def run_test(test_id, params: dict = None, inputs=None, output_template=None, **
        - models: A list of models to use for the test
        other inputs can be accessed inside the test via `self.inputs["input_name"]`
    """
-    TestClass = load_test(test_id, reload=True)
+    if not test_id and not name and not unit_metrics:
+        raise ValueError(
+            "`test_id` or `name` and `unit_metrics` must be provided to run a test"
+        )
+
+    if (unit_metrics and not name) or (name and not unit_metrics):
+        raise ValueError("`name` and `unit_metrics` must be provided together")
+
+    if unit_metrics:
+        TestClass = load_composite_metric(unit_metrics=unit_metrics, metric_name=name)
+        test_id = f"validmind.composite_metric.{name}"
+    else:
+        TestClass = load_test(test_id, reload=True)
+
    test = TestClass(
        test_id=test_id,
        context=TestContext(),
@@ -383,3 +417,7 @@ def register_test_provider(namespace: str, test_provider: ExternalTestProvider)
        test_provider (ExternalTestProvider): The test provider
    """
    __test_providers[namespace] = test_provider
+
+
+def _register_custom_test(test_id: str, test_class: object):
+    __custom_tests[test_id] = test_class
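The reworked run_test can now build a composite metric on the fly from a list of unit metric IDs. A minimal sketch, assuming vm_model and vm_dataset are already initialized with predictions assigned; the unit metric IDs follow the file paths added in this release and are illustrative:

from validmind.tests import run_test

# Runs several regression unit metrics together and logs them as one composite metric.
result = run_test(
    name="RegressionErrors",
    unit_metrics=[
        "validmind.unit_metrics.regression.sklearn.MeanAbsoluteError",
        "validmind.unit_metrics.regression.sklearn.MeanSquaredError",
    ],
    inputs={"model": vm_model, "dataset": vm_dataset},
)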
validmind/tests/data_validation/DatasetDescription.py CHANGED
@@ -122,7 +122,6 @@ class DatasetDescription(Metric):
         return self.cache_results(results)
 
     def infer_datatype(self, df):
-
         vm_dataset_variables = {}
         typeset = ProfilingTypeSet(Settings())
         variable_types = typeset.infer_type(df)
validmind/tests/data_validation/nlp/StopWords.py CHANGED
@@ -22,7 +22,6 @@ from validmind.vm_models import (
     ResultTableMetadata,
     ThresholdTest,
     ThresholdTestResult,
-    VMDataset,
 )
 
 
@@ -86,17 +85,13 @@ class StopWords(ThresholdTest):
                 ResultTable(
                     data=df,
                     metadata=ResultTableMetadata(
-                        title=f"Class Imbalance Results for Column {self.inputs.dataset.target_column}"
+                        title=f"Stop words results for column '{self.inputs.dataset.target_column}'"
                     ),
                 )
             ]
         )
 
     def run(self):
-        # Can only run this test if we have a Dataset object
-        if not isinstance(self.inputs.dataset, VMDataset):
-            raise ValueError("ClassImbalance requires a validmind Dataset object")
-
         text_column = self.inputs.dataset.text_column
 
         def create_corpus(df, text_column):
validmind/tests/data_validation/nlp/TextDescription.py CHANGED
@@ -92,9 +92,12 @@ class TextDescription(Metric):
         total_words = len(words)
         total_sentences = len(sentences)
         avg_sentence_length = round(
-            sum(len(sentence.split()) for sentence in sentences) / total_sentences
-            if total_sentences
-            else 0,
+            (
+                sum(len(sentence.split()) for sentence in sentences)
+                / total_sentences
+                if total_sentences
+                else 0
+            ),
             1,
         )
         total_paragraphs = len(paragraphs)
@@ -161,9 +164,13 @@ class TextDescription(Metric):
         return combined_df
 
     def run(self):
+        # Enforce that text_column must be provided as part of the params
+        if self.inputs.dataset.text_column is None:
+            raise ValueError("A 'text_column' must be provided to run this test.")
+
         # Can only run this test if we have a Dataset object
         if not isinstance(self.inputs.dataset, VMDataset):
-            raise ValueError("TextDescretion requires a validmind Dataset object")
+            raise ValueError("TextDescription requires a validmind Dataset object")
 
         df_text_description = self.text_description_table(
             self.inputs.dataset.df, self.params
@@ -177,27 +184,31 @@ class TextDescription(Metric):
             ("Total Unique Words", "Lexical Diversity"),
         ]
         params = {"combinations_to_plot": combinations_to_plot}
-        figures = self.text_description_scatter_plot(df_text_description, params)
+        figures = self.text_description_plots(df_text_description, params)
 
         return self.cache_results(
             figures=figures,
         )
 
     # Function to plot scatter plots for specified combinations using Plotly
-    def text_description_scatter_plot(self, df, params):
+    def text_description_plots(self, df, params):
         combinations_to_plot = params["combinations_to_plot"]
         figures = []
         # Create hist plots for each column
         for i, column in enumerate(df.columns):
             fig = px.histogram(df, x=column)
             fig.update_layout(bargap=0.2)
-            figures.append(Figure(for_object=self, key=self.key, figure=fig))
+            # Generate a unique key for each histogram using the column name and index
+            histogram_key = f"{self.name}_histogram_{column}_{i}"
+            figures.append(Figure(for_object=self, key=histogram_key, figure=fig))
 
-        for metric1, metric2 in combinations_to_plot:
+        for j, (metric1, metric2) in enumerate(combinations_to_plot):
             fig = px.scatter(
                 df, x=metric1, y=metric2, title=f"Scatter Plot: {metric1} vs {metric2}"
             )
-            figures.append(Figure(for_object=self, key=self.key, figure=fig))
+            # Generate a unique key for each scatter plot using the metric names and index
+            scatter_key = f"{self.name}_scatter_{metric1}_vs_{metric2}_{j}"
+            figures.append(Figure(for_object=self, key=scatter_key, figure=fig))
         plt.close("all")
 
         return figures
validmind/tests/decorator.py ADDED
@@ -0,0 +1,189 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+"""Decorators for creating and registering metrics with the ValidMind framework."""
+
+import inspect
+from uuid import uuid4
+
+import pandas as pd
+
+from validmind.logging import get_logger
+from validmind.utils import clean_docstring
+from validmind.vm_models import (
+    Metric,
+    MetricResult,
+    ResultSummary,
+    ResultTable,
+    ResultTableMetadata,
+)
+from validmind.vm_models.figure import (
+    Figure,
+    is_matplotlib_figure,
+    is_plotly_figure,
+    is_png_image,
+)
+from validmind.vm_models.test.result_wrapper import MetricResultWrapper
+
+from . import _register_custom_test
+
+logger = get_logger(__name__)
+
+
+def _inspect_signature(test_func: callable):
+    input_keys = ["dataset", "datasets", "model", "models"]
+
+    inputs = {}
+    params = {}
+
+    for name, arg in inspect.signature(test_func).parameters.items():
+        if name in input_keys:
+            target_dict = inputs
+        else:
+            target_dict = params
+
+        target_dict[name] = {
+            "type": arg.annotation,
+            "default": (
+                arg.default if arg.default is not inspect.Parameter.empty else None
+            ),
+        }
+
+    return inputs, params
+
+
+def _build_result(results, test_id, description, output_template):
+    ref_id = str(uuid4())
+    figure_metadata = {
+        "_type": "metric",
+        "_name": test_id,
+        "_ref_id": ref_id,
+    }
+
+    tables = []
+    figures = []
+
+    def process_item(item):
+        if is_matplotlib_figure(item) or is_plotly_figure(item) or is_png_image(item):
+            figures.append(
+                Figure(
+                    key=f"{test_id}:{len(figures) + 1}",
+                    figure=item,
+                    metadata=figure_metadata,
+                )
+            )
+        elif isinstance(item, list):
+            tables.append(ResultTable(data=item))
+        elif isinstance(item, pd.DataFrame):
+            tables.append(ResultTable(data=item))
+        elif isinstance(item, dict):
+            for table_name, table in item.items():
+                tables.append(
+                    ResultTable(
+                        data=table,
+                        metadata=ResultTableMetadata(title=table_name),
+                    )
+                )
+        else:
+            raise ValueError(f"Invalid return type: {type(item)}")
+
+    # if the results are a tuple, process each item as a separate result
+    if isinstance(results, tuple):
+        for item in results:
+            process_item(item)
+    else:
+        process_item(results)
+
+    return MetricResultWrapper(
+        result_id=test_id,
+        metric=MetricResult(
+            key=test_id,
+            ref_id=ref_id,
+            value="Empty",
+            summary=ResultSummary(results=tables),
+        ),
+        figures=figures,
+        result_metadata=[
+            {
+                "content_id": f"metric_description:{test_id}",
+                "text": clean_docstring(description),
+            }
+        ],
+        inputs=[],
+        output_template=output_template,
+    )
+
+
+def get_run_method(func, inputs, params):
+    def run(self: Metric):
+        input_kwargs = {k: getattr(self.inputs, k) for k in inputs.keys()}
+        param_kwargs = {
+            k: self.params.get(k, params[k]["default"]) for k in params.keys()
+        }
+
+        raw_results = func(**input_kwargs, **param_kwargs)
+
+        self.result = _build_result(
+            results=raw_results,
+            test_id=self.test_id,
+            description=self.__doc__,
+            output_template=self.output_template,
+        )
+
+        return self.result
+
+    return run
+
+
+def metric(func_or_id):
+    """Decorator for creating and registering metrics with the ValidMind framework.
+
+    Creates a metric object and registers it with ValidMind under the provided ID. If
+    no ID is provided, the function name will be used as to build one. So if the
+    function name is `my_metric`, the metric will be registered under the ID
+    `validmind.custom_metrics.my_metric`.
+
+    This decorator works by creating a new `Metric` class will be created whose `run`
+    method calls the decorated function. This function should take as arguments the
+    inputs it requires (`dataset`, `datasets`, `model`, `models`) followed by any
+    parameters. It can return any number of the following types:
+
+    - Table: Either a list of dictionaries or a pandas DataFrame
+    - Plot: Either a matplotlib figure or a plotly figure
+
+    The function may also include a docstring. This docstring will be used and logged
+    as the metric's description.
+
+    Args:
+        func: The function to decorate
+        test_id: The identifier for the metric. If not provided, the function name is used.
+
+    Returns:
+        The decorated function.
+    """
+
+    def decorator(func):
+        test_id = func_or_id or f"validmind.custom_metrics.{func.__name__}"
+
+        inputs, params = _inspect_signature(func)
+        description = inspect.getdoc(func)
+
+        metric_class = type(
+            func.__name__,
+            (Metric,),
+            {
+                "run": get_run_method(func, inputs, params),
+                "required_inputs": list(inputs.keys()),
+                "default_parameters": params,
+                "__doc__": description,
+            },
+        )
+        _register_custom_test(test_id, metric_class)
+
+        return func
+
+    if callable(func_or_id):
+        return decorator(func_or_id)
+
+    return decorator
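For reference, a minimal sketch of how the new @metric decorator might be used; the function name, test ID, and plotting details below are illustrative, not part of the release:

import matplotlib.pyplot as plt

from validmind.tests.decorator import metric


@metric("validmind.custom_metrics.target_histogram")
def target_histogram(dataset, bins: int = 10):
    """Histogram of the dataset's target column."""
    fig, ax = plt.subplots()
    ax.hist(dataset.df[dataset.target_column], bins=bins)
    return fig  # a table (list of dicts or DataFrame) could also be returned alongside the figure

Once decorated, the metric is registered under its test ID and can be executed with run_test("validmind.custom_metrics.target_histogram", inputs={"dataset": vm_dataset}).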
validmind/tests/model_validation/MeteorScore.py ADDED
@@ -0,0 +1,92 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from dataclasses import dataclass
+
+import evaluate
+import pandas as pd
+import plotly.graph_objects as go
+
+from validmind.vm_models import Figure, Metric
+
+
+@dataclass
+class MeteorScore(Metric):
+    """
+    Computes and visualizes the METEOR score for each text generation instance, assessing translation quality.
+
+    **Purpose**: METEOR (Metric for Evaluation of Translation with Explicit ORdering) is designed to evaluate the quality
+    of machine translations by comparing them against reference translations. It emphasizes both the accuracy and fluency
+    of translations, incorporating precision, recall, and word order into its assessment.
+
+    **Test Mechanism**: The METEOR score is computed for each pair of machine-generated translation (prediction) and its
+    corresponding human-produced reference. This is done by considering unigram matches between the translations, including
+    matches based on surface forms, stemmed forms, and synonyms. The score is a combination of unigram precision and recall,
+    adjusted for word order through a fragmentation penalty.
+
+    **Signs of High Risk**:
+    - Lower METEOR scores can indicate a lack of alignment between the machine-generated translations and their human-produced references, highlighting potential deficiencies in both the accuracy and fluency of translations.
+    - Significant discrepancies in word order or an excessive fragmentation penalty could signal issues with how the translation model processes and reconstructs sentence structures, potentially compromising the natural flow of translated text.
+    - Persistent underperformance across a variety of text types or linguistic contexts might suggest a broader inability of the model to adapt to the nuances of different languages or dialects, pointing towards gaps in its training or inherent limitations.
+
+    **Strengths**:
+    - Incorporates a balanced consideration of precision and recall, weighted towards recall to reflect the importance of
+    content coverage in translations.
+    - Directly accounts for word order, offering a nuanced evaluation of translation fluency beyond simple lexical matching.
+    - Adapts to various forms of lexical similarity, including synonyms and stemmed forms, allowing for flexible matching.
+
+    **Limitations**:
+    - While comprehensive, the complexity of METEOR's calculation can make it computationally intensive, especially for
+    large datasets.
+    - The use of external resources for synonym and stemming matching may introduce variability based on the resources'
+    quality and relevance to the specific translation task.
+    """
+
+    name = "meteor_score"
+    required_inputs = ["model", "dataset"]
+
+    def run(self):
+        # Load the METEOR metric
+        meteor = evaluate.load("meteor")
+
+        # Initialize a list to hold METEOR scores
+        meteor_scores = []
+
+        for prediction, reference in zip(
+            self.inputs.dataset.y_pred(self.inputs.model.input_id),
+            self.inputs.dataset.y,
+        ):
+            # Compute the METEOR score for the current prediction-reference pair
+            result = meteor.compute(predictions=[prediction], references=[reference])
+            meteor_scores.append(result["meteor"])
+
+        # Visualization of METEOR scores
+        figures = self.visualize_scores(meteor_scores)
+
+        return self.cache_results(figures=figures)
+
+    def visualize_scores(self, scores):
+        # Convert the scores list to a DataFrame for plotting
+        scores_df = pd.DataFrame(scores, columns=["METEOR Score"])
+
+        # Create a line plot of the METEOR scores
+        fig = go.Figure()
+        fig.add_trace(
+            go.Scatter(
+                x=scores_df.index,
+                y=scores_df["METEOR Score"],
+                mode="lines+markers",
+                name="METEOR Score",
+            )
+        )
+        fig.update_layout(
+            title="METEOR Scores Across Text Instances",
+            xaxis_title="Text Instance Index",
+            yaxis_title="METEOR Score",
+        )
+
+        # Wrap the Plotly figure for compatibility with your framework
+        figures = [Figure(for_object=self, key=self.key, figure=fig)]
+
+        return figures
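A quick way to exercise the new test, assuming vm_model and vm_dataset are already initialized and predictions for vm_model have been assigned to the dataset (a sketch, not prescribed usage):

from validmind.tests import run_test

# Runs the METEOR test; the returned result wrapper carries the per-instance score line plot.
result = run_test(
    "validmind.model_validation.MeteorScore",
    inputs={"model": vm_model, "dataset": vm_dataset},
)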
validmind/tests/model_validation/RegardHistogram.py CHANGED
@@ -58,21 +58,19 @@ class RegardHistogram(Metric):
 
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
         y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
-        input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
 
-        if not len(y_true) == len(y_pred) == len(input_text):
+        if not len(y_true) == len(y_pred):
             raise ValueError(
-                "Inconsistent lengths among input text, true summaries, and predicted summaries."
+                "Inconsistent lengths among true summaries and predicted summaries."
             )
 
-        return input_text, y_true, y_pred
+        return y_true, y_pred
 
     def regard_histogram(self):
         regard_tool = evaluate.load("regard")
-        input_text, y_true, y_pred = self._get_datasets()
+        y_true, y_pred = self._get_datasets()
 
         dataframes = {
-            "Input Text": input_text,
             "Target Text": y_true,
             "Predicted Summaries": y_pred,
         }
@@ -101,6 +99,7 @@
         )
 
         row_offset = 0
+
         for column_name, column_data in dataframes.items():
             results = regard_tool.compute(data=column_data)["regard"]
             regard_dicts = [
validmind/tests/model_validation/RegardScore.py CHANGED
@@ -59,21 +59,19 @@ class RegardScore(Metric):
 
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
         y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
-        input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
 
-        if not len(y_true) == len(y_pred) == len(input_text):
+        if not len(y_true) == len(y_pred):
             raise ValueError(
                 "Inconsistent lengths among input text, true summaries, and predicted summaries."
             )
 
-        return input_text, y_true, y_pred
+        return y_true, y_pred
 
     def regard_line_plot(self):
         regard_tool = evaluate.load("regard")
-        input_text, y_true, y_pred = self._get_datasets()
+        y_true, y_pred = self._get_datasets()
 
         dataframes = {
-            "Input Text": input_text,
             "Target Text": y_true,
             "Predicted Summaries": y_pred,
         }