EuroEval 15.15.0-py3-none-any.whl → 16.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +3 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +323 -193
- euroeval/benchmark_modules/vllm.py +166 -112
- euroeval/benchmarker.py +59 -33
- euroeval/cli.py +3 -3
- euroeval/constants.py +13 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +53 -7
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +38 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +8 -7
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +46 -14
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +234 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +17 -6
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +96 -23
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +47 -75
- euroeval/tasks.py +31 -6
- euroeval/tokenization_utils.py +295 -207
- euroeval/utils.py +118 -34
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +12 -14
- euroeval-16.0.0.dist-info/RECORD +69 -0
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -468
- euroeval-15.15.0.dist-info/RECORD +0 -63
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/scores.py
CHANGED
@@ -52,7 +52,12 @@ def log_scores(
         test_se, test_se_str = metric.postprocessing_fn(test_se)
         total_dict[f"test_{metric.name}"] = test_score
         total_dict[f"test_{metric.name}_se"] = test_se
-
+        log_str = (
+            f"{metric.pretty_name}: {test_score_str} ± {test_se_str}"
+            if not np.isnan(test_se)
+            else f"{metric.pretty_name}: {test_score_str}"
+        )
+        logger.info(log_str)

     return dict(raw=scores, total=total_dict)

@@ -84,7 +89,7 @@ def aggregate_scores(

     if len(test_scores) > 1:
         sample_std = np.std(test_scores, ddof=1)
-        test_se = sample_std / np.sqrt(len(test_scores))
+        test_se = (sample_std / np.sqrt(len(test_scores))).item()
     else:
         test_se = np.nan

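The two hunks above change how aggregated scores are reported: the standard error is converted to a plain Python float with .item(), and the "± SE" suffix is only logged when the standard error is not NaN (i.e. when more than one score is available). A minimal illustrative sketch of that behaviour, using made-up numbers and a local helper rather than EuroEval's actual functions:

    import numpy as np

    def aggregate(test_scores: list[float]) -> tuple[float, float]:
        """Return the mean score and its standard error (NaN when only one score exists)."""
        mean_score = np.mean(test_scores).item()
        if len(test_scores) > 1:
            sample_std = np.std(test_scores, ddof=1)
            # .item() turns the NumPy scalar into a plain Python float, as in the diff
            se = (sample_std / np.sqrt(len(test_scores))).item()
        else:
            se = np.nan
        return mean_score, se

    score, se = aggregate([71.2, 69.8, 70.5])
    log_str = f"Accuracy: {score:.1f} ± {se:.1f}" if not np.isnan(se) else f"Accuracy: {score:.1f}"
    print(log_str)  # Accuracy: 70.5 ± 0.4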
euroeval/speed_benchmark.py
CHANGED
@@ -59,7 +59,7 @@ def benchmark_speed_single_iteration(
     Returns:
         A dictionary containing the scores for the current iteration.
     """
-
+    gpt2_tokeniser = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)

     base_doc = "Document which contains roughly 10 tokens. "
     multiplier = 10 * (1 + itr_idx)

@@ -74,11 +74,11 @@ def benchmark_speed_single_iteration(
        model.generate(inputs=dict(text=[doc]))

    def encoder_predict(doc: str) -> None:
-
+        tokeniser = model.get_tokeniser()
        pytorch_model = model.get_pytorch_module()
        inputs = {
            key: tensor.to(pytorch_model.device)
-            for key, tensor in
+            for key, tensor in tokeniser(
                text=[doc], truncation=True, return_tensors="pt"
            ).items()
        }

@@ -102,21 +102,21 @@ def benchmark_speed_single_iteration(
        speed_scores = pyinfer.InferenceReport(
            model=predict, inputs=doc, n_seconds=3
        ).run(print_report=False)
-        num_gpt2_tokens = len(
+        num_gpt2_tokens = len(gpt2_tokeniser([doc], truncation=True)["input_ids"][0])
        gpt2_tokens_per_second = speed_scores["Infer(p/sec)"] * num_gpt2_tokens

        speed_scores_short = pyinfer.InferenceReport(
            model=predict, inputs=short_doc, n_seconds=3
        ).run(print_report=False)
        num_gpt2_tokens_short = len(
-
+            gpt2_tokeniser([short_doc], truncation=True)["input_ids"][0]
        )
        gpt2_tokens_per_second_short = (
            speed_scores_short["Infer(p/sec)"] * num_gpt2_tokens_short
        )

    except (RuntimeError, ValueError, IndexError) as e:
-        raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}")
+        raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}") from e

    return dict(
        test_speed=gpt2_tokens_per_second, test_speed_short=gpt2_tokens_per_second_short

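The speed benchmark now instantiates the GPT-2 tokeniser up front and uses it to convert pyinfer's inferences-per-second figure into GPT-2 tokens per second. A rough, self-contained sketch of that conversion (the throughput number below is made up; in EuroEval it comes from pyinfer's InferenceReport):

    from transformers import AutoTokenizer

    gpt2_tokeniser = AutoTokenizer.from_pretrained("gpt2")

    doc = "Document which contains roughly 10 tokens. " * 20
    inferences_per_second = 4.2  # placeholder for speed_scores["Infer(p/sec)"]

    # Count the GPT-2 tokens in the document, truncating to the tokeniser's max length
    num_gpt2_tokens = len(gpt2_tokeniser([doc], truncation=True)["input_ids"][0])
    gpt2_tokens_per_second = inferences_per_second * num_gpt2_tokens
    print(f"{gpt2_tokens_per_second:.0f} GPT-2 tokens per second")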
euroeval/task_group_utils/multiple_choice_classification.py
CHANGED

@@ -94,15 +94,15 @@ class MultipleChoiceClassificationTrainer(Trainer):


 def prepare_examples(
-    examples: "BatchEncoding",
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare the features.

     Args:
         examples:
             The examples to prepare.
-
-        The
+        tokeniser:
+            The tokeniser to use to prepare the examples.

     Returns:
         The prepared examples.

@@ -110,11 +110,22 @@ def prepare_examples(
     doc: str = examples["text"][0]
     sections = doc.split("\n")

-
+    candidate_choice_idxs = [
         idx
         for idx, section in enumerate(sections)
-        if re.match(pattern=r"^[a-
+        if re.match(pattern=r"^[a-z0-9]+\. ", string=section) is not None
     ]
+
+    # Sometimes the question itself starts with a letter or number followed by a dot, We
+    # want to ignore these cases, and focus on the final contingent block of at least
+    # two choices.
+    choice_idxs: list[int] = list()
+    for idx in reversed(candidate_choice_idxs):
+        if len(choice_idxs) < 2 or (
+            len(choice_idxs) >= 2 and idx == choice_idxs[-1] - 1
+        ):
+            choice_idxs.append(idx)
+
     choices = [sections[idx] for idx in choice_idxs]

     # Check that the choices are present, and that all of them are at the end

@@ -127,7 +138,7 @@ def prepare_examples(
     question_idx = min(choice_idxs) - 2  # -2 to remove the 'Choices:' line
     context_and_question = "\n".join(sections[: question_idx + 1]).strip()

-    new_examples =
+    new_examples = tokeniser(
         text=[context_and_question] * len(choices),
         text_pair=[choice[3:] for choice in choices],
         padding=True,

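The new choice-detection logic above first collects every line that looks like a choice ("a. ", "b. ", "1. ", ...) and then walks backwards, keeping only the final contiguous run of at least two such lines, so a question that itself starts with a numbered prefix is not mistaken for a choice. A toy sketch of just that selection step (the real function additionally expects a 'Choices:' line and tokenises the result):

    import re

    doc = "1. What is the capital of Denmark?\nChoices:\na. Oslo\nb. Copenhagen\nc. Stockholm"
    sections = doc.split("\n")

    candidate_choice_idxs = [
        idx
        for idx, section in enumerate(sections)
        if re.match(pattern=r"^[a-z0-9]+\. ", string=section) is not None
    ]

    # Walk backwards and keep only the trailing contiguous block of candidates
    choice_idxs: list[int] = []
    for idx in reversed(candidate_choice_idxs):
        if len(choice_idxs) < 2 or (len(choice_idxs) >= 2 and idx == choice_idxs[-1] - 1):
            choice_idxs.append(idx)

    print([sections[idx] for idx in choice_idxs])
    # ['c. Stockholm', 'b. Copenhagen', 'a. Oslo'] -- the numbered question is excluded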
euroeval/task_group_utils/question_answering.py
CHANGED

@@ -23,7 +23,7 @@ if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
     from transformers.training_args import TrainingArguments

-    from ..data_models import DatasetConfig, GenerativeModelOutput
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions

 logger = logging.getLogger("euroeval")

@@ -57,7 +57,7 @@ class QuestionAnsweringTrainer(Trainer):
            **kwargs,
        )

-        # Get the CLS token id for the
+        # Get the CLS token id for the tokeniser
        if self.tokenizer is not None:
            assert isinstance(self.tokenizer, PreTrainedTokenizerBase)
            special_token_metadata = get_special_token_metadata(self.tokenizer)

@@ -149,6 +149,7 @@ class QuestionAnsweringTrainer(Trainer):
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     dataset: "Dataset",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.

@@ -159,6 +160,8 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
+        benchmark_config:
+            The configuration of the benchmark.
         dataset:
             The dataset used for evaluation. This is only used in case any additional
             metadata is used to compute the metrics.

@@ -186,7 +189,11 @@ def compute_metrics(
     results: dict[str, float] = dict()
     for metric in dataset_config.task.metrics:
         score: float | None = metric(
-            predictions=predictions,
+            predictions=predictions,
+            references=labels,
+            dataset=dataset,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
         )

         # The metric returns None if we are running on multi-GPU and the current

@@ -221,15 +228,15 @@ def extract_labels_from_generation(


 def prepare_train_examples(
-    examples: "BatchEncoding",
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare the features for training.

     Args:
         examples:
             The examples to prepare.
-
-        The
+        tokeniser:
+            The tokeniser to use to prepare the examples.

     Returns:
         The prepared examples.

@@ -239,15 +246,15 @@ def prepare_train_examples(
     # take a lots of space). So we remove that left whitespace
     examples["question"] = [q.lstrip() for q in examples["question"]]

-    # Extract special token metadata from the
-    special_token_metadata = get_special_token_metadata(
+    # Extract special token metadata from the tokeniser
+    special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
     has_cls_token = special_token_metadata["has_cls_token"]
     has_sep_token = special_token_metadata["has_sep_token"]
     cls_token_id = special_token_metadata["cls_token_id"]
     cls_token = special_token_metadata["cls_token"]
     sep_token = special_token_metadata["sep_token"]

-    # If the
+    # If the tokeniser is not adding special tokens, then we add them manually
     if not has_cls_token and not has_sep_token:
         examples["question"] = [
             f"{cls_token}{q}{sep_token}" for q in examples["question"]

@@ -258,18 +265,18 @@ def prepare_train_examples(
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
-    max_question_tokens = max(len(
+    max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
-    stride =
-    max_length =
+    stride = tokeniser.model_max_length // 4
+    max_length = tokeniser.model_max_length - stride
     stride = min(stride, max_length - max_question_tokens - num_special_tokens)
-    max_length =
+    max_length = tokeniser.model_max_length - stride

     # Tokenize our examples with truncation and padding, but keep the overflows using a
     # stride. This results in one example possible giving several features when a
     # context is long, each of those features having a context that overlaps a bit the
     # context of the previous feature.
-    tokenized_examples =
+    tokenized_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
         truncation="only_second",

@@ -306,9 +313,9 @@ def prepare_train_examples(
         sequence_ids = tokenized_examples.sequence_ids(i)

         # Manually ensure that the special tokens are set to None in `sequence_ids`
-        for special_token in
-        if hasattr(
-        special_token_id = getattr(
+        for special_token in tokeniser.special_tokens_map.keys():
+            if hasattr(tokeniser, f"{special_token}_id"):
+                special_token_id = getattr(tokeniser, f"{special_token}_id")
                 if special_token_id is not None:
                     sequence_ids = [
                         None if token_id == special_token_id else seq_id

@@ -373,15 +380,15 @@ def prepare_train_examples(


 def prepare_test_examples(
-    examples: "BatchEncoding",
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare test examples.

     Args:
         examples:
             Dictionary of test examples.
-
-        The
+        tokeniser:
+            The tokeniser used to preprocess the examples.

     Returns:
         The prepared test examples.

@@ -391,14 +398,14 @@ def prepare_test_examples(
     # take a lots of space). So we remove that left whitespace
     examples["question"] = [q.lstrip() for q in examples["question"]]

-    # Extract special token metadata from the
-    special_token_metadata = get_special_token_metadata(
+    # Extract special token metadata from the tokeniser
+    special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
     has_cls_token = special_token_metadata["has_cls_token"]
     has_sep_token = special_token_metadata["has_sep_token"]
     cls_token = special_token_metadata["cls_token"]
     sep_token = special_token_metadata["sep_token"]

-    # If the
+    # If the tokeniser is not adding special tokens, then we add them manually
     if not has_cls_token and not has_sep_token:
         examples["question"] = [
             f"{cls_token}{q}{sep_token}" for q in examples["question"]

@@ -409,18 +416,18 @@ def prepare_test_examples(
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
-    max_question_tokens = max(len(
+    max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
-    stride =
-    max_length =
+    stride = tokeniser.model_max_length // 4
+    max_length = tokeniser.model_max_length - stride
     stride = min(stride, max_length - max_question_tokens - num_special_tokens)
-    max_length =
+    max_length = tokeniser.model_max_length - stride

     # Tokenize our examples with truncation and maybe padding, but keep the overflows
     # using a stride. This results in one example possible giving several features when
     # a context is long, each of those features having a context that overlaps a bit
     # the context of the previous feature.
-    tokenized_examples =
+    tokenized_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
         truncation="only_second",

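Both prepare_train_examples and prepare_test_examples now derive the stride and maximum length directly from the tokeniser, capping the stride so the longest question plus special tokens always fits in a window. The same arithmetic with illustrative numbers (a 512-token tokeniser is assumed here purely for the example):

    model_max_length = 512    # stand-in for tokeniser.model_max_length
    max_question_tokens = 40  # longest tokenised question in the batch
    num_special_tokens = 2    # e.g. [CLS] and [SEP]

    stride = model_max_length // 4                                               # 128
    max_length = model_max_length - stride                                       # 384
    stride = min(stride, max_length - max_question_tokens - num_special_tokens)  # 128
    max_length = model_max_length - stride                                       # 384
    print(stride, max_length)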
euroeval/task_group_utils/sequence_classification.py
CHANGED

@@ -7,6 +7,7 @@ import typing as t
 import Levenshtein
 import numpy as np

+from ..enums import TaskGroup
 from ..exceptions import InvalidBenchmark
 from ..utils import log_once, raise_if_model_output_contains_nan_values

@@ -14,7 +15,7 @@ if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
     from transformers.trainer_utils import EvalPrediction

-    from ..data_models import DatasetConfig, GenerativeModelOutput
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions

@@ -24,6 +25,7 @@ logger = logging.getLogger("euroeval")
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     dataset: "Dataset",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.

@@ -34,6 +36,8 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
+        benchmark_config:
+            The configuration of the benchmark.
         dataset:
             The dataset used for evaluation. This is only used in case any additional
             metadata is used to compute the metrics.

@@ -79,7 +83,11 @@ def compute_metrics(
     results: dict[str, float] = dict()
     for metric in dataset_config.task.metrics:
         score: float | None = metric(
-            predictions=predictions,
+            predictions=predictions,
+            references=label_ids,
+            dataset=dataset,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
         )

         # The metric returns None if we are running on multi-GPU and the current

@@ -113,6 +121,12 @@ def extract_labels_from_generation(

     Returns:
         The predicted labels.
+
+    Raises:
+        InvalidBenchmark:
+            If the task requires log probabilities, but the model did not output them,
+            or if the model outputted log probabilities but the first label token
+            mapping is not provided.
     """
     if model_output.scores is not None:
         if first_label_token_mapping is False:

@@ -127,25 +141,74 @@ def extract_labels_from_generation(
         )
         if labels is not None:
             return labels
+        elif dataset_config.task.requires_logprobs:
+            raise InvalidBenchmark(
+                "This task requires the model to output logprobs, and this model "
+                "does not seem to be able to do that. Skipping the evaluation."
+            )

+    # Get the candidate labels, which are the labels that the model can predict
     candidate_labels = [
         dataset_config.prompt_label_mapping[lbl]
         for lbl in dataset_config.id2label.values()
     ]
+
     new_predicted_labels: list[str] = list()
-    for predicted_label in model_output.sequences:
+    for idx, predicted_label in enumerate(model_output.sequences):
+        # Special case if we are doing multiple choice classification: we in this case
+        # dynamically change the candidate labels to the labels mentioned in the prompt
+        if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+            prompt = input_batch["text"][idx]
+            sample_candidate_labels = [
+                candidate_label
+                for candidate_label in candidate_labels
+                if re.search(
+                    pattern=rf"\b{candidate_label}. ",
+                    string=prompt,
+                    flags=re.IGNORECASE,
+                )
+                is not None
+            ]
+        else:
+            sample_candidate_labels = candidate_labels
+
         # If the prediction includes a boxed answer, use that instead of the full
         # generation
         if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
             predicted_label = m.group(1)

-        #
+        # We set the word edit distance weights such that we heavily penalise insertions
+        # and substitutions, so that we don't just insert the correct label, but that we
+        # want the model to have included the correct label in its output.
+        insertion_weight = 1000
+        deletion_weight = 1
+        substitution_weight = 1000
+
+        # Compute the word edit distances between the predicted label and all candidate
+        # labels
         edit_distances = [
-            Levenshtein.distance(
-
+            Levenshtein.distance(
+                s1=predicted_label.lower(),
+                s2=candidate_label.lower(),
+                weights=(insertion_weight, deletion_weight, substitution_weight),
+            )
+            for candidate_label in sample_candidate_labels
         ]
-
-
+
+        # If no candidate labels were found, we assume that something is wrong with the
+        # model output, and we raise an error
+        if min(edit_distances) > 100:
+            raise InvalidBenchmark(
+                "No candidate labels found for the predicted label "
+                f"{predicted_label!r}, out of the candidate labels "
+                f"{sample_candidate_labels}. This likely means that the model output "
+                "is completely off, and we cannot extract any labels from it. Please "
+                "check the model output and the candidate labels."
+            )
+
+        # Pick the label with the smallest word edit distance to the predicted label
+        best_candidate_label = sample_candidate_labels[np.argmin(edit_distances).item()]
+        new_predicted_labels.append(best_candidate_label)

     return new_predicted_labels

@@ -187,11 +250,7 @@ def get_closest_logprobs_labels(
     for sample in generation_logprobs:
         for logprob_list in sample:
             generated_labels = [
-                re.sub(
-                    pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
-                    repl="",
-                    string=label.lower(),
-                )
+                re.sub(pattern=r"^[^a-zæøåüöä0-9]+$", repl="", string=label.lower())
                 for label, _ in logprob_list
             ]
             generated_labels = [label for label in generated_labels if label != ""]

@@ -227,6 +286,18 @@ def get_closest_logprobs_labels(
                 if candidate_label.startswith(generated_label)
             }

+            # If the generated label is a numeral (e.g., "1", "2", "3") and there is
+            # a matching candidate label, we only keep the full match
+            if re.match(r"^\d+$", generated_label) and any(
+                candidate_label == generated_label
+                for candidate_label in candidate_output_labels
+            ):
+                candidate_output_labels = {
+                    candidate_label
+                    for candidate_label in candidate_output_labels
+                    if candidate_label == generated_label
+                }
+
             # If we can uniquely determine the output label, we break the loop.
             if len(candidate_output_labels) == 1:
                 output_label = candidate_output_labels.pop()

@@ -263,10 +334,12 @@ def get_closest_logprobs_labels(
             if candidate_output_labels_starting_with_generated_label:
                 log_once(
                     f"No candidate label found for the generated label "
-                    f"{generated_label!r}
-                    "
-                    "
-                    "
+                    f"{generated_label!r}, but there are candidate labels "
+                    f"starting with it: "
+                    f"{candidate_output_labels_starting_with_generated_label}. "
+                    "This means that the first label token mapping is not "
+                    "reliable, and we will instead fall back to extracting "
+                    "the labels using word edit distance.",
                     level=logging.DEBUG,
                 )
                 return None

@@ -291,16 +364,16 @@ def get_closest_logprobs_labels(
             if len(sample) == 0:
                 log_once(
                     "The model outputted an empty string, so no candidate labels could "
-                    f"be determined. Using {candidate_labels[0]!r}
-                    "label.",
-                    level=logging.
+                    f"be determined. Using the first label, {candidate_labels[0]!r}, "
+                    "as the output label.",
+                    level=logging.INFO,
                 )
             else:
                 log_once(
                     "Could not find a candidate label for any of the generated "
-                    f"labels in the sample {sample}. Using
-                    "as the output label.",
-                    level=logging.
+                    f"labels in the sample {sample}. Using the first label, "
+                    f"{candidate_labels[0]!r}, as the output label.",
+                    level=logging.INFO,
                 )
             output_labels.append(candidate_labels[0])

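The rewritten label extraction above matches each generation against the candidate labels with a weighted edit distance: insertions and substitutions are made very expensive (weight 1000) while deletions stay cheap, so a candidate can only win if it is essentially contained in the model's output. A small self-contained sketch of that matching step with toy labels (not tied to any EuroEval dataset):

    import Levenshtein
    import numpy as np

    candidate_labels = ["positive", "negative", "neutral"]
    predicted_label = "The sentiment of the review is clearly positive."

    insertion_weight, deletion_weight, substitution_weight = 1000, 1, 1000
    edit_distances = [
        Levenshtein.distance(
            predicted_label.lower(),
            candidate_label.lower(),
            weights=(insertion_weight, deletion_weight, substitution_weight),
        )
        for candidate_label in candidate_labels
    ]

    best_candidate_label = candidate_labels[np.argmin(edit_distances).item()]
    print(best_candidate_label)  # "positive": reachable from the output by deletions alone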
euroeval/task_group_utils/text_to_text.py
CHANGED

@@ -75,7 +75,11 @@ def compute_metrics
     while True:
         try:
             score: float | None = metric(
-                predictions=predictions,
+                predictions=predictions,
+                references=labels,
+                dataset=dataset,
+                dataset_config=dataset_config,
+                benchmark_config=benchmark_config,
             )
             break
         except Exception as e:

@@ -85,7 +89,7 @@
                 "MPS backend out of memory",
             ]
             if not any(error in str(e) for error in oom_error):
-                raise InvalidBenchmark(str(e))
+                raise InvalidBenchmark(str(e)) from e

             if (
                 isinstance(metric, HuggingFaceMetric)

@@ -98,7 +102,7 @@
                 "the CPU."
             )
         else:
-            raise InvalidBenchmark(str(e))
+            raise InvalidBenchmark(str(e)) from e
     finally:
         for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
             if hasattr(metric, attribute):