validmind 2.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (127)
  1. validmind/__init__.py +6 -3
  2. validmind/__version__.py +1 -1
  3. validmind/ai.py +193 -0
  4. validmind/api_client.py +45 -31
  5. validmind/client.py +33 -6
  6. validmind/datasets/classification/customer_churn.py +2 -2
  7. validmind/datasets/credit_risk/__init__.py +11 -0
  8. validmind/datasets/credit_risk/datasets/lending_club_loan_data_2007_2014_clean.csv.gz +0 -0
  9. validmind/datasets/credit_risk/lending_club.py +394 -0
  10. validmind/datasets/nlp/__init__.py +5 -0
  11. validmind/datasets/nlp/cnn_dailymail.py +98 -0
  12. validmind/datasets/nlp/datasets/cnn_dailymail_100_with_predictions.csv +255 -0
  13. validmind/datasets/nlp/datasets/cnn_dailymail_500_with_predictions.csv +1277 -0
  14. validmind/datasets/nlp/datasets/sentiments_with_predictions.csv +4847 -0
  15. validmind/errors.py +11 -1
  16. validmind/logging.py +9 -2
  17. validmind/models/huggingface.py +2 -2
  18. validmind/models/pytorch.py +3 -3
  19. validmind/models/sklearn.py +4 -4
  20. validmind/template.py +2 -2
  21. validmind/test_suites/__init__.py +4 -2
  22. validmind/tests/__init__.py +130 -45
  23. validmind/tests/data_validation/DatasetDescription.py +0 -1
  24. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +3 -1
  25. validmind/tests/data_validation/PiTCreditScoresHistogram.py +1 -1
  26. validmind/tests/data_validation/ScatterPlot.py +8 -2
  27. validmind/tests/data_validation/nlp/StopWords.py +1 -6
  28. validmind/tests/data_validation/nlp/TextDescription.py +20 -9
  29. validmind/tests/decorator.py +313 -0
  30. validmind/tests/model_validation/BertScore.py +1 -1
  31. validmind/tests/model_validation/BertScoreAggregate.py +1 -1
  32. validmind/tests/model_validation/BleuScore.py +1 -1
  33. validmind/tests/model_validation/ClusterSizeDistribution.py +1 -1
  34. validmind/tests/model_validation/ContextualRecall.py +1 -1
  35. validmind/tests/model_validation/FeaturesAUC.py +110 -0
  36. validmind/tests/model_validation/MeteorScore.py +92 -0
  37. validmind/tests/model_validation/RegardHistogram.py +6 -7
  38. validmind/tests/model_validation/RegardScore.py +4 -6
  39. validmind/tests/model_validation/RegressionResidualsPlot.py +127 -0
  40. validmind/tests/model_validation/RougeMetrics.py +7 -5
  41. validmind/tests/model_validation/RougeMetricsAggregate.py +1 -1
  42. validmind/tests/model_validation/SelfCheckNLIScore.py +112 -0
  43. validmind/tests/model_validation/TokenDisparity.py +1 -1
  44. validmind/tests/model_validation/ToxicityHistogram.py +1 -1
  45. validmind/tests/model_validation/ToxicityScore.py +1 -1
  46. validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
  47. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +1 -3
  48. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +17 -22
  49. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +1 -1
  50. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +16 -17
  51. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +1 -1
  52. validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
  53. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +21 -3
  54. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +1 -1
  55. validmind/tests/model_validation/sklearn/MinimumF1Score.py +1 -1
  56. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +1 -1
  57. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +5 -4
  58. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +2 -2
  59. validmind/tests/model_validation/sklearn/ROCCurve.py +6 -12
  60. validmind/tests/model_validation/sklearn/RegressionErrors.py +2 -2
  61. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +6 -4
  62. validmind/tests/model_validation/sklearn/RegressionR2Square.py +2 -2
  63. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +55 -5
  64. validmind/tests/model_validation/sklearn/SilhouettePlot.py +1 -1
  65. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +11 -5
  66. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +2 -2
  67. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +140 -0
  68. validmind/tests/model_validation/statsmodels/GINITable.py +22 -45
  69. validmind/tests/model_validation/statsmodels/{LogisticRegPredictionHistogram.py → PredictionProbabilitiesHistogram.py} +67 -92
  70. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +2 -2
  71. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -2
  72. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  73. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  74. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  75. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +2 -2
  76. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +128 -0
  77. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +70 -103
  78. validmind/tests/prompt_validation/ai_powered_test.py +2 -0
  79. validmind/tests/test_providers.py +14 -124
  80. validmind/unit_metrics/__init__.py +75 -70
  81. validmind/unit_metrics/classification/sklearn/Accuracy.py +14 -0
  82. validmind/unit_metrics/classification/sklearn/F1.py +13 -0
  83. validmind/unit_metrics/classification/sklearn/Precision.py +13 -0
  84. validmind/unit_metrics/classification/sklearn/ROC_AUC.py +13 -0
  85. validmind/unit_metrics/classification/sklearn/Recall.py +13 -0
  86. validmind/unit_metrics/composite.py +228 -0
  87. validmind/unit_metrics/regression/GiniCoefficient.py +33 -0
  88. validmind/unit_metrics/regression/HuberLoss.py +23 -0
  89. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +30 -0
  90. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +16 -0
  91. validmind/unit_metrics/regression/MeanBiasDeviation.py +13 -0
  92. validmind/unit_metrics/regression/QuantileLoss.py +15 -0
  93. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +21 -0
  94. validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py +13 -0
  95. validmind/unit_metrics/regression/sklearn/MeanSquaredError.py +13 -0
  96. validmind/unit_metrics/regression/sklearn/RSquaredScore.py +13 -0
  97. validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py +20 -0
  98. validmind/utils.py +20 -31
  99. validmind/vm_models/__init__.py +0 -2
  100. validmind/vm_models/dataset.py +623 -29
  101. validmind/vm_models/figure.py +52 -17
  102. validmind/vm_models/test/metric.py +33 -31
  103. validmind/vm_models/test/output_template.py +0 -27
  104. validmind/vm_models/test/result_wrapper.py +68 -36
  105. validmind/vm_models/test/test.py +4 -2
  106. validmind/vm_models/test/threshold_test.py +24 -14
  107. validmind/vm_models/test_context.py +7 -0
  108. validmind/vm_models/test_suite/runner.py +1 -1
  109. validmind/vm_models/test_suite/summary.py +3 -3
  110. validmind/vm_models/test_suite/test.py +1 -1
  111. validmind/vm_models/test_suite/test_suite.py +2 -1
  112. {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/METADATA +18 -18
  113. {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/RECORD +116 -94
  114. validmind-2.1.0.dist-info/entry_points.txt +3 -0
  115. validmind/tests/__types__.py +0 -62
  116. validmind/tests/model_validation/statsmodels/LogRegressionConfusionMatrix.py +0 -128
  117. validmind/tests/model_validation/statsmodels/LogisticRegCumulativeProb.py +0 -172
  118. validmind/tests/model_validation/statsmodels/ScorecardBucketHistogram.py +0 -181
  119. validmind/tests/model_validation/statsmodels/ScorecardProbabilitiesHistogram.py +0 -175
  120. validmind/unit_metrics/sklearn/classification/Accuracy.py +0 -20
  121. validmind/unit_metrics/sklearn/classification/F1.py +0 -22
  122. validmind/unit_metrics/sklearn/classification/Precision.py +0 -22
  123. validmind/unit_metrics/sklearn/classification/ROC_AUC.py +0 -20
  124. validmind/unit_metrics/sklearn/classification/Recall.py +0 -20
  125. validmind/vm_models/test/unit_metric.py +0 -88
  126. {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/LICENSE +0 -0
  127. {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/WHEEL +0 -0
validmind/tests/decorator.py (new file)
@@ -0,0 +1,313 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+"""Decorators for creating and registering metrics with the ValidMind framework."""
+
+# TODO: as we move entirely to a functional approach a lot of this logic
+# should be moved into the __init__ to replace the old class-based stuff
+
+import inspect
+import os
+from uuid import uuid4
+
+import pandas as pd
+
+from validmind.errors import MissingRequiredTestInputError
+from validmind.logging import get_logger
+from validmind.vm_models import (
+    Metric,
+    MetricResult,
+    ResultSummary,
+    ResultTable,
+    ResultTableMetadata,
+)
+from validmind.vm_models.figure import (
+    Figure,
+    is_matplotlib_figure,
+    is_plotly_figure,
+    is_png_image,
+)
+from validmind.vm_models.test.result_wrapper import MetricResultWrapper
+
+logger = get_logger(__name__)
+
+
+def _inspect_signature(test_func: callable):
+    input_keys = ["dataset", "datasets", "model", "models"]
+
+    inputs = {}
+    params = {}
+
+    for name, arg in inspect.signature(test_func).parameters.items():
+        if name in input_keys:
+            target_dict = inputs
+        else:
+            target_dict = params
+
+        target_dict[name] = {
+            "type": arg.annotation,
+            "default": (
+                arg.default if arg.default is not inspect.Parameter.empty else None
+            ),
+        }
+
+    return inputs, params
+
+
+def _build_result(results, test_id, description, output_template, inputs):  # noqa: C901
+    ref_id = str(uuid4())
+    figure_metadata = {
+        "_type": "metric",
+        "_name": test_id,
+        "_ref_id": ref_id,
+    }
+
+    tables = []
+    figures = []
+
+    def process_item(item):
+        # TODO: build out a more robust/extensible system for this
+        # TODO: custom type handlers would be really cool
+
+        # unit metrics (scalar values) - show in a simple table for now
+        if isinstance(item, int) or isinstance(item, float) or isinstance(item, str):
+            tables.append(ResultTable(data=[{test_id.split(".")[-1]: item}]))
+
+        # plots
+        elif isinstance(item, Figure):
+            figures.append(item)
+        elif is_matplotlib_figure(item) or is_plotly_figure(item) or is_png_image(item):
+            figures.append(
+                Figure(
+                    key=f"{test_id}:{len(figures) + 1}",
+                    figure=item,
+                    metadata=figure_metadata,
+                )
+            )
+
+        # tables
+        elif isinstance(item, list) or isinstance(item, pd.DataFrame):
+            tables.append(ResultTable(data=item))
+        elif isinstance(item, dict):
+            for table_name, table in item.items():
+                if not isinstance(table, list) and not isinstance(table, pd.DataFrame):
+                    raise ValueError(
+                        f"Invalid table format: {table_name} must be a list or DataFrame"
+                    )
+
+                tables.append(
+                    ResultTable(
+                        data=table,
+                        metadata=ResultTableMetadata(title=table_name),
+                    )
+                )
+
+        else:
+            raise ValueError(f"Invalid return type: {type(item)}")
+
+    # if the results are a tuple, process each item as a separate result
+    if isinstance(results, tuple):
+        for item in results:
+            process_item(item)
+    else:
+        process_item(results)
+
+    return MetricResultWrapper(
+        result_id=test_id,
+        metric=MetricResult(
+            key=test_id,
+            ref_id=ref_id,
+            value="Empty",
+            summary=ResultSummary(results=tables),
+        ),
+        figures=figures,
+        result_metadata=[
+            {
+                "content_id": f"metric_description:{test_id}",
+                "text": description,
+            }
+        ],
+        inputs=inputs,
+        output_template=output_template,
+    )
+
+
+def _get_run_method(func, inputs, params):
+    def run(self: Metric):
+        input_kwargs = {}
+        for k in inputs.keys():
+            try:
+                input_kwargs[k] = getattr(self.inputs, k)
+            except AttributeError:
+                raise MissingRequiredTestInputError(f"Missing required input: {k}.")
+
+        param_kwargs = {
+            k: self.params.get(k, params[k]["default"]) for k in params.keys()
+        }
+
+        raw_results = func(**input_kwargs, **param_kwargs)
+
+        self.result = _build_result(
+            results=raw_results,
+            test_id=self.test_id,
+            description=inspect.getdoc(self),
+            output_template=self.output_template,
+            inputs=list(inputs.keys()),
+        )
+
+        return self.result
+
+    return run
+
+
+def _get_save_func(func, test_id):
+    def save(root_folder=".", imports=None):
+        parts = test_id.split(".")
+
+        if len(parts) > 1:
+            path = os.path.join(root_folder, *parts[1:-1])
+            test_name = parts[-1]
+            new_test_id = f"<test_provider_namespace>.{'.'.join(parts[1:])}"
+        else:
+            path = root_folder
+            test_name = parts[0]
+            new_test_id = f"<test_provider_namespace>.{test_name}"
+
+        if not os.path.exists(path):
+            os.makedirs(path, exist_ok=True)
+
+        full_path = os.path.join(path, f"{test_name}.py")
+
+        source = inspect.getsource(func)
+        # remove decorator line
+        source = source.split("\n", 1)[1]
+        if imports:
+            imports = "\n".join(imports)
+            source = f"{imports}\n\n\n{source}"
+        # add comment to the top of the file
+        source = f"""
+# Saved from {func.__module__}.{func.__name__}
+# Original Test ID: {test_id}
+# New Test ID: {new_test_id}
+
+{source}
+"""
+
+        # ensure that the function name matches the test name
+        source = source.replace(f"def {func.__name__}", f"def {test_name}")
+
+        # use black to format the code
+        try:
+            import black
+
+            source = black.format_str(source, mode=black.FileMode())
+        except ImportError:
+            # ignore if not available
+            pass
+
+        with open(full_path, "w") as file:
+            file.writelines(source)
+
+        logger.info(
+            f"Saved to {os.path.abspath(full_path)}!"
+            "Be sure to add any necessary imports to the top of the file."
+        )
+        logger.info(
+            f"This metric can be run with the ID: {new_test_id}",
+        )
+
+    return save
+
+
+def metric(func_or_id):
+    """Decorator for creating and registering metrics with the ValidMind framework.
+
+    Creates a metric object and registers it with ValidMind under the provided ID. If
+    no ID is provided, the function name will be used to build one. So if the
+    function name is `my_metric`, the metric will be registered under the ID
+    `validmind.custom_metrics.my_metric`.
+
+    This decorator works by creating a new `Metric` class whose `run`
+    method calls the decorated function. This function should take as arguments the
+    inputs it requires (`dataset`, `datasets`, `model`, `models`) followed by any
+    parameters. It can return any number of the following types:
+
+    - Table: Either a list of dictionaries or a pandas DataFrame
+    - Plot: Either a matplotlib figure or a plotly figure
+    - Scalar: A single number or string
+
+    The function may also include a docstring. This docstring will be used and logged
+    as the metric's description.
+
+    Args:
+        func: The function to decorate
+        test_id: The identifier for the metric. If not provided, the function name is used.
+
+    Returns:
+        The decorated function.
+    """
+
+    from . import _register_custom_test
+
+    def decorator(func):
+        test_id = func_or_id or f"validmind.custom_metrics.{func.__name__}"
+
+        inputs, params = _inspect_signature(func)
+        description = inspect.getdoc(func)
+        tasks = getattr(func, "__tasks__", [])
+        tags = getattr(func, "__tags__", [])
+
+        metric_class = type(
+            func.__name__,
+            (Metric,),
+            {
+                "run": _get_run_method(func, inputs, params),
+                "required_inputs": list(inputs.keys()),
+                "default_parameters": params,
+                "__doc__": description,
+                "metadata": {
+                    "task_types": tasks,
+                    "tags": tags,
+                },
+            },
+        )
+        _register_custom_test(test_id, metric_class)
+
+        # special function to allow the function to be saved to a file
+        func.save = _get_save_func(func, test_id)
+
+        return func
+
+    if callable(func_or_id):
+        return decorator(func_or_id)
+
+    return decorator
+
+
+def tasks(*tasks):
+    """Decorator for specifying the task types that a metric is designed for.
+
+    Args:
+        *tasks: The task types that the metric is designed for.
+    """
+
+    def decorator(func):
+        func.__tasks__ = list(tasks)
+        return func
+
+    return decorator
+
+
+def tags(*tags):
+    """Decorator for specifying tags for a metric.
+
+    Args:
+        *tags: The tags to apply to the metric.
+    """
+
+    def decorator(func):
+        func.__tags__ = list(tags)
+        return func
+
+    return decorator
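
For orientation, here is a minimal usage sketch of the decorators defined in this new module. The metric name, its logic, and the `dataset.df` / `dataset.target_column` attributes used below are illustrative assumptions, not code from the release; the registration and return-type behavior is taken from the `metric()` docstring above.

# Hedged sketch: registering a custom metric with the decorators from validmind/tests/decorator.py.
# The function body and the dataset attributes it touches are assumptions for illustration only.
from validmind.tests.decorator import metric, tags, tasks


@metric("validmind.custom_metrics.class_balance")
@tags("tabular_data", "example")
@tasks("classification")
def class_balance(dataset, normalize: bool = True):
    """Distribution of the target column, returned as a table."""
    counts = dataset.df[dataset.target_column].value_counts(normalize=normalize)
    return counts.rename("proportion").reset_index()

Note the ordering: `@tags` and `@tasks` are applied first (they only set `__tags__` / `__tasks__` on the function), so `@metric` sits on top and picks them up when it builds and registers the `Metric` subclass. Per `_get_save_func`, calling `class_balance.save()` would then write the function out as a standalone test file under a test-provider namespace.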
validmind/tests/model_validation/BertScore.py
@@ -57,7 +57,7 @@ class BertScore(Metric):
 
     def run(self):
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
         # Load the bert evaluation metric
         bert = evaluate.load("bertscore")
validmind/tests/model_validation/BertScoreAggregate.py
@@ -50,7 +50,7 @@ class BertScoreAggregate(Metric):
 
     def run(self):
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
         bert = evaluate.load("bertscore")
         bert_s = bert.compute(predictions=y_pred, references=y_true, lang="en")
validmind/tests/model_validation/BleuScore.py
@@ -55,7 +55,7 @@ class BleuScore(Metric):
 
         # Compute the BLEU score
         bleu = bleu.compute(
-            predictions=self.inputs.dataset.y_pred(self.inputs.model.input_id),
+            predictions=self.inputs.dataset.y_pred(self.inputs.model),
             references=self.inputs.dataset.y,
         )
         return self.cache_results(metric_value={"blue_score_metric": bleu})
validmind/tests/model_validation/ClusterSizeDistribution.py
@@ -61,7 +61,7 @@ class ClusterSizeDistribution(Metric):
 
     def run(self):
        y_true_train = self.inputs.dataset.y
-        y_pred_train = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+        y_pred_train = self.inputs.dataset.y_pred(self.inputs.model)
         y_true_train = y_true_train.astype(y_pred_train.dtype)
         df = pd.DataFrame(
             {"Actual": y_true_train.ravel(), "Prediction": y_pred_train.ravel()}
validmind/tests/model_validation/ContextualRecall.py
@@ -66,7 +66,7 @@ class ContextualRecall(Metric):
 
     def run(self):
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
         score_list = []
         for y_t, y_p in zip(y_true, y_pred):
validmind/tests/model_validation/FeaturesAUC.py (new file)
@@ -0,0 +1,110 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from dataclasses import dataclass
+
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+from sklearn.metrics import roc_auc_score
+
+from validmind.errors import SkipTestError
+from validmind.logging import get_logger
+from validmind.vm_models import Figure, Metric
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class FeaturesAUC(Metric):
+    """
+    Evaluates the discriminatory power of each individual feature within a binary classification model by calculating the Area Under the Curve (AUC) for each feature separately.
+
+    **Purpose**: The central objective of this metric is to quantify how well each feature on its own can differentiate between the two classes in a binary classification problem. It serves as a univariate analysis tool that can help in pre-modeling feature selection or post-modeling interpretation.
+
+    **Test Mechanism**: For each feature, the metric treats the feature values as raw scores to compute the AUC against the actual binary outcomes. It provides an AUC value for each feature, offering a simple yet powerful indication of each feature's univariate classification strength.
+
+    **Signs of High Risk**:
+    - A feature with a low AUC score may not be contributing significantly to the differentiation between the two classes, which could be a concern if it is expected to be predictive.
+    - Conversely, a surprisingly high AUC for a feature not believed to be informative may suggest data leakage or other issues with the data.
+
+    **Strengths**:
+    - By isolating each feature, it highlights the individual contribution of features to the classification task without the influence of other variables.
+    - Useful for both initial feature evaluation and for providing insights into the model's reliance on individual features after model training.
+
+    **Limitations**:
+    - Does not reflect the combined effects of features or any interaction between them, which can be critical in certain models.
+    - The AUC values are calculated without considering the model's use of the features, which could lead to different interpretations of feature importance when considering the model holistically.
+    - This metric is applicable only to binary classification tasks and cannot be directly extended to multiclass classification or regression without modifications.
+    """
+
+    name = "features_auc"
+    required_inputs = ["model", "dataset"]
+    default_params = {
+        "fontsize": 12,
+        "figure_height": 500,
+    }
+    metadata = {
+        "task_types": ["classification"],
+        "tags": [
+            "feature_importance",
+            "AUC",
+            "visualization",
+        ],
+    }
+
+    def run(self):
+        x = self.inputs.dataset.x_df()
+        y = self.inputs.dataset.y_df()
+
+        if y.nunique() != 2:
+            raise SkipTestError("FeaturesAUC metric requires a binary target variable.")
+
+        aucs = pd.DataFrame(index=x.columns, columns=["AUC"])
+
+        for column in x.columns:
+            feature_values = x[column]
+            if feature_values.nunique() > 1:
+                auc_score = roc_auc_score(y, feature_values)
+                aucs.loc[column, "AUC"] = auc_score
+            else:
+                aucs.loc[
+                    column, "AUC"
+                ] = np.nan  # Not enough unique values to calculate AUC
+
+        # Sorting the AUC scores in descending order
+        sorted_indices = aucs["AUC"].dropna().sort_values(ascending=False).index
+
+        # Plotting the results
+        fig = go.Figure()
+        fig.add_trace(
+            go.Bar(
+                y=[column for column in sorted_indices],
+                x=[aucs.loc[column, "AUC"] for column in sorted_indices],
+                orientation="h",
+            )
+        )
+        fig.update_layout(
+            title_text="Feature AUC Scores",
+            yaxis=dict(
+                tickmode="linear",
+                dtick=1,
+                tickfont=dict(size=self.params["fontsize"]),
+                title="Features",
+                autorange="reversed",  # Ensure that the highest AUC is at the top
+            ),
+            xaxis=dict(title="AUC"),
+            height=self.params["figure_height"],
+        )
+
+        return self.cache_results(
+            metric_value=aucs.to_dict(),
+            figures=[
+                Figure(
+                    for_object=self,
+                    key="features_auc",
+                    figure=fig,
+                ),
+            ],
+        )
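
The test mechanism described in the FeaturesAUC docstring (each feature's raw values scored directly against the binary target) can be reproduced outside the framework with scikit-learn alone. A minimal sketch follows; the toy data is invented for illustration and is not part of the package:

# Standalone sketch of the per-feature AUC idea used by FeaturesAUC above (toy data).
import pandas as pd
from sklearn.metrics import roc_auc_score

X = pd.DataFrame({"income": [20, 35, 50, 80, 95], "age": [25, 40, 31, 52, 47]})
y = pd.Series([0, 0, 1, 1, 1])

# One AUC per column: the feature values act as the "scores", the binary target as ground truth.
feature_aucs = {
    col: roc_auc_score(y, X[col]) for col in X.columns if X[col].nunique() > 1
}
print(feature_aucs)  # {'income': 1.0, 'age': 0.833...}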
validmind/tests/model_validation/MeteorScore.py (new file)
@@ -0,0 +1,92 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from dataclasses import dataclass
+
+import evaluate
+import pandas as pd
+import plotly.graph_objects as go
+
+from validmind.vm_models import Figure, Metric
+
+
+@dataclass
+class MeteorScore(Metric):
+    """
+    Computes and visualizes the METEOR score for each text generation instance, assessing translation quality.
+
+    **Purpose**: METEOR (Metric for Evaluation of Translation with Explicit ORdering) is designed to evaluate the quality
+    of machine translations by comparing them against reference translations. It emphasizes both the accuracy and fluency
+    of translations, incorporating precision, recall, and word order into its assessment.
+
+    **Test Mechanism**: The METEOR score is computed for each pair of machine-generated translation (prediction) and its
+    corresponding human-produced reference. This is done by considering unigram matches between the translations, including
+    matches based on surface forms, stemmed forms, and synonyms. The score is a combination of unigram precision and recall,
+    adjusted for word order through a fragmentation penalty.
+
+    **Signs of High Risk**:
+    - Lower METEOR scores can indicate a lack of alignment between the machine-generated translations and their human-produced references, highlighting potential deficiencies in both the accuracy and fluency of translations.
+    - Significant discrepancies in word order or an excessive fragmentation penalty could signal issues with how the translation model processes and reconstructs sentence structures, potentially compromising the natural flow of translated text.
+    - Persistent underperformance across a variety of text types or linguistic contexts might suggest a broader inability of the model to adapt to the nuances of different languages or dialects, pointing towards gaps in its training or inherent limitations.
+
+    **Strengths**:
+    - Incorporates a balanced consideration of precision and recall, weighted towards recall to reflect the importance of
+    content coverage in translations.
+    - Directly accounts for word order, offering a nuanced evaluation of translation fluency beyond simple lexical matching.
+    - Adapts to various forms of lexical similarity, including synonyms and stemmed forms, allowing for flexible matching.
+
+    **Limitations**:
+    - While comprehensive, the complexity of METEOR's calculation can make it computationally intensive, especially for
+    large datasets.
+    - The use of external resources for synonym and stemming matching may introduce variability based on the resources'
+    quality and relevance to the specific translation task.
+    """
+
+    name = "meteor_score"
+    required_inputs = ["model", "dataset"]
+
+    def run(self):
+        # Load the METEOR metric
+        meteor = evaluate.load("meteor")
+
+        # Initialize a list to hold METEOR scores
+        meteor_scores = []
+
+        for prediction, reference in zip(
+            self.inputs.dataset.y_pred(self.inputs.model),
+            self.inputs.dataset.y,
+        ):
+            # Compute the METEOR score for the current prediction-reference pair
+            result = meteor.compute(predictions=[prediction], references=[reference])
+            meteor_scores.append(result["meteor"])
+
+        # Visualization of METEOR scores
+        figures = self.visualize_scores(meteor_scores)
+
+        return self.cache_results(figures=figures)
+
+    def visualize_scores(self, scores):
+        # Convert the scores list to a DataFrame for plotting
+        scores_df = pd.DataFrame(scores, columns=["METEOR Score"])
+
+        # Create a line plot of the METEOR scores
+        fig = go.Figure()
+        fig.add_trace(
+            go.Scatter(
+                x=scores_df.index,
+                y=scores_df["METEOR Score"],
+                mode="lines+markers",
+                name="METEOR Score",
+            )
+        )
+        fig.update_layout(
+            title="METEOR Scores Across Text Instances",
+            xaxis_title="Text Instance Index",
+            yaxis_title="METEOR Score",
+        )
+
+        # Wrap the Plotly figure for compatibility with your framework
+        figures = [Figure(for_object=self, key=self.key, figure=fig)]
+
+        return figures
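
The per-pair scoring loop in MeteorScore.run can also be exercised directly with the Hugging Face evaluate library, as the same load/compute calls appear in the hunk above. A minimal sketch with invented sentences:

# Standalone sketch of per-pair METEOR scoring, mirroring MeteorScore.run above (toy sentences).
import evaluate

meteor = evaluate.load("meteor")

predictions = ["The cat sat on the mat.", "Rain is expected later today."]
references = ["A cat was sitting on the mat.", "Showers are expected later in the day."]

# One METEOR score per prediction-reference pair, as in the test's run() method.
scores = [
    meteor.compute(predictions=[p], references=[r])["meteor"]
    for p, r in zip(predictions, references)
]
print(scores)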
validmind/tests/model_validation/RegardHistogram.py
@@ -57,22 +57,20 @@ class RegardHistogram(Metric):
             raise AttributeError("The 'model' attribute is missing.")
 
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
-        input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
-        if not len(y_true) == len(y_pred) == len(input_text):
+        if not len(y_true) == len(y_pred):
             raise ValueError(
-                "Inconsistent lengths among input text, true summaries, and predicted summaries."
+                "Inconsistent lengths among true summaries and predicted summaries."
             )
 
-        return input_text, y_true, y_pred
+        return y_true, y_pred
 
     def regard_histogram(self):
         regard_tool = evaluate.load("regard")
-        input_text, y_true, y_pred = self._get_datasets()
+        y_true, y_pred = self._get_datasets()
 
         dataframes = {
-            "Input Text": input_text,
             "Target Text": y_true,
             "Predicted Summaries": y_pred,
         }
@@ -101,6 +99,7 @@ class RegardHistogram(Metric):
         )
 
         row_offset = 0
+
        for column_name, column_data in dataframes.items():
            results = regard_tool.compute(data=column_data)["regard"]
            regard_dicts = [
validmind/tests/model_validation/RegardScore.py
@@ -58,22 +58,20 @@ class RegardScore(Metric):
             raise AttributeError("The 'model' attribute is missing.")
 
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
-        input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
-        if not len(y_true) == len(y_pred) == len(input_text):
+        if not len(y_true) == len(y_pred):
             raise ValueError(
                 "Inconsistent lengths among input text, true summaries, and predicted summaries."
             )
 
-        return input_text, y_true, y_pred
+        return y_true, y_pred
 
     def regard_line_plot(self):
         regard_tool = evaluate.load("regard")
-        input_text, y_true, y_pred = self._get_datasets()
+        y_true, y_pred = self._get_datasets()
 
         dataframes = {
-            "Input Text": input_text,
             "Target Text": y_true,
             "Predicted Summaries": y_pred,
         }