euroeval-15.2.0-py3-none-any.whl
- euroeval/__init__.py +72 -0
- euroeval/benchmark_config_factory.py +358 -0
- euroeval/benchmark_modules/__init__.py +7 -0
- euroeval/benchmark_modules/base.py +354 -0
- euroeval/benchmark_modules/fresh.py +286 -0
- euroeval/benchmark_modules/hf.py +1185 -0
- euroeval/benchmark_modules/litellm.py +905 -0
- euroeval/benchmark_modules/vllm.py +1171 -0
- euroeval/benchmarker.py +1074 -0
- euroeval/callbacks.py +72 -0
- euroeval/cli.py +281 -0
- euroeval/constants.py +50 -0
- euroeval/data_loading.py +96 -0
- euroeval/data_models.py +474 -0
- euroeval/dataset_configs.py +2001 -0
- euroeval/enums.py +144 -0
- euroeval/exceptions.py +191 -0
- euroeval/finetuning.py +324 -0
- euroeval/generation.py +296 -0
- euroeval/human_evaluation.py +737 -0
- euroeval/languages.py +200 -0
- euroeval/model_cache.py +253 -0
- euroeval/model_config.py +77 -0
- euroeval/model_loading.py +78 -0
- euroeval/scores.py +90 -0
- euroeval/speed_benchmark.py +124 -0
- euroeval/task_utils/__init__.py +1 -0
- euroeval/task_utils/multiple_choice_classification.py +176 -0
- euroeval/task_utils/question_answering.py +698 -0
- euroeval/task_utils/sequence_classification.py +237 -0
- euroeval/task_utils/text_to_text.py +150 -0
- euroeval/task_utils/token_classification.py +464 -0
- euroeval/tasks.py +202 -0
- euroeval/types.py +97 -0
- euroeval/utils.py +574 -0
- euroeval-15.2.0.dist-info/METADATA +234 -0
- euroeval-15.2.0.dist-info/RECORD +40 -0
- euroeval-15.2.0.dist-info/WHEEL +4 -0
- euroeval-15.2.0.dist-info/entry_points.txt +4 -0
- euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0
euroeval/task_utils/sequence_classification.py
@@ -0,0 +1,237 @@
"""Utility functions related to the sequence-classification task group."""

import logging
import re
import typing as t

import evaluate
import Levenshtein
import numpy as np
from evaluate import EvaluationModule

from ..data_models import BenchmarkConfig, GenerativeModelOutput
from ..utils import log_once, raise_if_model_output_contains_nan_values

if t.TYPE_CHECKING:
    from ..data_models import DatasetConfig
    from ..types import Labels, Predictions


logger = logging.getLogger("euroeval")


def compute_metrics(
    model_outputs_and_labels: tuple["Predictions", "Labels"],
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
) -> dict[str, float]:
    """Compute the metrics needed for evaluation.

    Args:
        model_outputs_and_labels:
            The first sequence contains the model outputs and the second sequence
            contains the true labels.
        dataset_config:
            The configuration of the dataset.
        benchmark_config:
            The configuration of the benchmark.

    Returns:
        A dictionary with the names of the metrics as keys and the metric values as
        values.
    """
    model_outputs, labels = model_outputs_and_labels
    label2id = {label: idx for idx, label in dataset_config.id2label.items()}
    raise_if_model_output_contains_nan_values(model_output=model_outputs)

    metrics = {
        metric_cfg.name: (
            evaluate.load(
                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
            )
            if metric_cfg.huggingface_id != ""
            else None
        )
        for metric_cfg in dataset_config.task.metrics
    }

    model_output_dtype = np.asarray(model_outputs).dtype
    if model_output_dtype in [np.float16, np.float32, np.float64]:
        predictions = np.asarray(model_outputs).argmax(axis=-1)
    else:
        predictions = model_outputs

    prompt_label_to_label_mapping = {
        prompt_label: label
        for label, prompt_label in dataset_config.prompt_label_mapping.items()
    }
    predictions = [
        (
            label2id[prompt_label_to_label_mapping[pred.lower()]]
            if isinstance(pred, str)
            else pred
        )
        for pred in predictions
    ]

    label_ids = [
        label2id[label.lower()] if isinstance(label, str) else label for label in labels
    ]

    results: dict[str, float] = dict()
    for cfg in dataset_config.task.metrics:
        metric = metrics[cfg.name]
        assert isinstance(metric, EvaluationModule)
        score_dict: dict[str, float] | None = metric.compute(
            predictions=predictions, references=label_ids, **cfg.compute_kwargs
        )

        # The metric returns None if we are running on multi-GPU and the current
        # process is not the main process
        if score_dict is not None:
            scores = score_dict[cfg.results_key]
            if isinstance(scores, list):
                scores = sum(scores) / len(scores)
            results[cfg.name] = scores

    return results


def extract_labels_from_generation(
    input_batch: dict[str, list],
    model_output: GenerativeModelOutput,
    dataset_config: "DatasetConfig",
) -> list[str]:
    """Extract the predicted labels from the generated output.

    Args:
        input_batch:
            The input batch, where the keys are the feature names and the values
            are lists with the feature values.
        model_output:
            The raw generated output of the model.
        dataset_config:
            The configuration of the dataset.

    Returns:
        The predicted labels.
    """
    if model_output.scores is not None:
        return get_closest_logprobs_labels(
            generation_logprobs=model_output.scores, dataset_config=dataset_config
        )
    else:
        return get_closest_word_edit_labels(
            generated_sequences=model_output.sequences, dataset_config=dataset_config
        )


def get_closest_logprobs_labels(
    generation_logprobs: list[list[list[tuple[str, float]]]],
    dataset_config: "DatasetConfig",
) -> list[str]:
    """Get the labels with the highest predicted logprob value.

    In case a candidate label is split into multiple tokens, we only use the first
    token to compute the logprob value. E.g., if the candidate label "positive" is
    tokenised as ["pos", "itive"], we only use the logprob value of "pos" to
    represent the logprob value of the entire label.

    Args:
        generation_logprobs:
            The logprobs of the generated tokens, for all samples in the batch. Of
            shape (batch_size, num_tokens, num_logprobs).
        dataset_config:
            The configuration of the dataset.

    Returns:
        The predicted labels.

    Raises:
        InvalidBenchmark:
            If no candidate label can be found for any of the generated labels.
    """
    english_labels = list(dataset_config.id2label.values())
    english2local = dataset_config.prompt_label_mapping
    candidate_labels = [
        english2local[lbl].lower() for lbl in english_labels
    ] + english_labels

    output_labels: list[str] = list()
    for sample in generation_logprobs:
        for logprob_list in sample:
            generated_labels = [
                re.sub(
                    pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
                    repl="",
                    string=label.lower(),
                )
                for label, _ in logprob_list
            ]
            generated_labels = [label for label in generated_labels if label != ""]

            # We want to use the first generated label which starts with a candidate
            # label, as the output label
            output_label: str | None = None
            for generated_label in generated_labels:
                candidate_output_labels = [
                    candidate_label
                    for candidate_label in candidate_labels
                    if candidate_label.startswith(generated_label)
                ]
                if candidate_output_labels:
                    output_label = candidate_output_labels[0]
                    break

            if output_label is not None:
                output_label = english2local.get(output_label, output_label)
                output_labels.append(output_label)
                break
        else:
            if len(sample) == 0:
                log_once(
                    "The model outputted an empty string, so no candidate labels could "
                    f"be determined. Using {candidate_labels[0]!r} as the output "
                    "label.",
                    level=logging.DEBUG,
                )
            else:
                log_once(
                    "Could not find a candidate label for any of the generated "
                    f"labels in the sample {sample}. Using {candidate_labels[0]!r} "
                    "as the output label.",
                    level=logging.DEBUG,
                )
            output_labels.append(candidate_labels[0])

    assert len(output_labels) == len(generation_logprobs)
    return output_labels


def get_closest_word_edit_labels(
    generated_sequences: list[str], dataset_config: "DatasetConfig"
) -> list[str]:
    """Get the labels with the smallest edit distance to the predicted labels.

    Args:
        generated_sequences:
            The generated sequences from the model.
        dataset_config:
            The configuration of the dataset.

    Returns:
        The candidate labels with the smallest edit distance to the predicted labels.
    """
    candidate_labels = [
        dataset_config.prompt_label_mapping[lbl]
        for lbl in dataset_config.id2label.values()
    ]
    new_predicted_labels: list[str] = list()
    for predicted_label in generated_sequences:
        edit_distances = [
            Levenshtein.distance(s1=predicted_label.lower(), s2=candidate_label.lower())
            for candidate_label in candidate_labels
        ]
        closest_label = candidate_labels[np.argmin(edit_distances).item()]
        new_predicted_labels.append(closest_label)
    return new_predicted_labels
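When the model does not return logprobs, get_closest_word_edit_labels above maps each free-form generation onto the candidate label with the smallest Levenshtein edit distance. The following standalone sketch reproduces that mapping in isolation; the candidate labels and generations are invented for illustration and are not package data.

# Standalone sketch of the edit-distance fallback in
# euroeval/task_utils/sequence_classification.py. The candidate labels and
# generations below are hypothetical examples.
import Levenshtein
import numpy as np

candidate_labels = ["positiv", "negativ", "neutral"]
generations = ["Positivt!", "negative", "nøytral tone"]

mapped_labels = []
for generation in generations:
    # Edit distance from the (lower-cased) generation to every candidate label
    distances = [
        Levenshtein.distance(generation.lower(), candidate.lower())
        for candidate in candidate_labels
    ]
    # Keep the candidate label closest to the generation
    mapped_labels.append(candidate_labels[int(np.argmin(distances))])

print(mapped_labels)  # ['positiv', 'negativ', 'neutral']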
euroeval/task_utils/text_to_text.py
@@ -0,0 +1,150 @@
"""Utility functions related to the text-to-text task group."""

import logging
import typing as t

import evaluate
import numpy as np
from evaluate import EvaluationModule

from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
from ..exceptions import InvalidBenchmark
from ..utils import (
    HiddenPrints,
    clear_memory,
    raise_if_model_output_contains_nan_values,
)

if t.TYPE_CHECKING:
    from ..types import Labels, Predictions


logger = logging.getLogger("euroeval")


def compute_metrics(
    model_outputs_and_labels: tuple["Predictions", "Labels"],
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
) -> dict[str, float]:
    """Compute the metrics needed for evaluation.

    Args:
        model_outputs_and_labels:
            The first sequence contains the model outputs and the second sequence
            contains the true labels.
        dataset_config:
            The configuration of the dataset.
        benchmark_config:
            The configuration of the benchmark.

    Returns:
        A dictionary with the names of the metrics as keys and the metric values as
        values.
    """
    model_outputs, labels = model_outputs_and_labels
    raise_if_model_output_contains_nan_values(model_output=model_outputs)

    metrics = {
        metric_cfg.name: (
            evaluate.load(
                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
            )
            if metric_cfg.huggingface_id != ""
            else None
        )
        for metric_cfg in dataset_config.task.metrics
    }

    model_output_dtype = np.asarray(model_outputs).dtype
    output_is_prob = model_output_dtype in [np.float16, np.float32, np.float64]
    if output_is_prob:
        predictions = np.asarray(model_outputs).argmax(axis=-1)
    else:
        predictions = model_outputs

    results: dict[str, float] = dict()
    for cfg in dataset_config.task.metrics:
        metric = metrics[cfg.name]
        assert isinstance(metric, EvaluationModule)

        # Some metrics can be computed on hardware accelerators. In this case we
        # start by setting the device to the same device as the model
        if cfg.compute_kwargs.get("device", None) == "auto":
            cfg.compute_kwargs["device"] = benchmark_config.device.type

        while True:
            try:
                with HiddenPrints():
                    score_dict: dict[str, float] | None = metric.compute(
                        predictions=predictions, references=labels, **cfg.compute_kwargs
                    )

                # Clear the cache of the BERTScorer to avoid memory leaks
                for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
                    if hasattr(metric, attribute):
                        delattr(metric, attribute)

                clear_memory()
                break
            except Exception as e:
                # Clear the cache of the BERTScorer to avoid memory leaks
                if hasattr(metric, "cached_bertscorer"):
                    del metric.cached_bertscorer
                clear_memory()

                oom_error = [
                    "CUDA out of memory",
                    "CUDA error",
                    "MPS backend out of memory",
                ]
                if not any(error in str(e) for error in oom_error):
                    raise InvalidBenchmark(str(e))

                if cfg.compute_kwargs.get("batch_size", 1) > 1:
                    batch_size = cfg.compute_kwargs["batch_size"]
                    cfg.compute_kwargs["batch_size"] = batch_size // 2
                    logger.debug(
                        "Out of memory error occurred during the computation of "
                        f"the metric {cfg.pretty_name}. Reducing the batch size to "
                        f"{cfg.compute_kwargs['batch_size']}."
                    )
                elif cfg.compute_kwargs.get("device", "cpu") != "cpu":
                    cfg.compute_kwargs["batch_size"] = 32
                    cfg.compute_kwargs["device"] = "cpu"
                    logger.debug(
                        "Out of memory error occurred during the computation of "
                        f"the metric {cfg.pretty_name}. Moving the computation to "
                        "the CPU."
                    )
                else:
                    raise InvalidBenchmark(str(e))

        # The metric returns None if we are running on multi-GPU and the current
        # process is not the main process
        if score_dict is not None:
            scores = score_dict[cfg.results_key]
            if isinstance(scores, list):
                scores = sum(scores) / len(scores)
            results[cfg.name] = scores

    return results


def extract_labels_from_generation(
    input_batch: dict[str, list], model_output: "GenerativeModelOutput"
) -> list[t.Any]:
    """Extract the predicted labels from the generated output.

    Args:
        input_batch:
            The input batch, where the keys are the feature names and the values
            are lists with the feature values.
        model_output:
            The raw generated output of the model.

    Returns:
        The predicted labels.
    """
    return model_output.sequences