EuroEval 15.16.0-py3-none-any.whl → 16.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +8 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +190 -110
- euroeval/benchmark_modules/vllm.py +199 -139
- euroeval/benchmarker.py +49 -22
- euroeval/cli.py +3 -3
- euroeval/constants.py +19 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +73 -23
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +35 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +6 -6
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +90 -20
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +276 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/model_cache.py +13 -1
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +19 -8
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +128 -42
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +59 -73
- euroeval/tasks.py +33 -6
- euroeval/tokenization_utils.py +294 -207
- euroeval/utils.py +150 -35
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
- euroeval-16.0.1.dist-info/RECORD +69 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -470
- euroeval-15.16.0.dist-info/RECORD +0 -63
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
euroeval/prompt_templates/summarization.py
CHANGED

@@ -1,10 +1,15 @@
 """Templates for the Summarization task."""

+import typing as t
+
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, PT, SV
+from ..languages import DA, DE, EN, ES, ET, FI, FR, IS, IT, LV, NB, NL, NN, NO, PT, SV
+
+if t.TYPE_CHECKING:
+    from ..data_models import Language

 # TODO: Missing Faroese
-SUMM_TEMPLATES = {
+SUMM_TEMPLATES: dict["Language", PromptConfig] = {
     DA: PromptConfig(
         default_prompt_prefix="Følgende er dokumenter med tilhørende resuméer.",
         default_prompt_template="Dokument: {text}\nResumé: {target_text}",
@@ -32,8 +37,14 @@ SUMM_TEMPLATES = {
         default_prompt_prefix="A continuación se presentan documentos con resúmenes "
         "adjuntos.",
         default_prompt_template="Documento: {text}\nResumen: {target_text}",
-        default_instruction_prompt="Documento: {text}\n\
-
+        default_instruction_prompt="Documento: {text}\n\n",
+        default_prompt_label_mapping=dict(),
+    ),
+    ET: PromptConfig(
+        default_prompt_prefix="Allpool on dokumendid koos kokkuvõtetega.",
+        default_prompt_template="Dokument: {text}\nKokkuvõte: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nKoosta ülaltoodud dokumendi "
+        "kokkuvõte.",
         default_prompt_label_mapping=dict(),
     ),
     PT: PromptConfig(
@@ -58,6 +69,15 @@ SUMM_TEMPLATES = {
         "document ci-dessus.",
         default_prompt_label_mapping=dict(),
     ),
+    LV: PromptConfig(
+        default_prompt_prefix="Tālāk ir dokumenti ar pievienotām kopsavilkumiem.",
+        default_prompt_template="Dokuments: {text}\nKopsavilkums: {target_text}",
+        default_instruction_prompt=(
+            "Dokuments: {text}\n\n"
+            "Uzrakstiet kopsavilkumu par iepriekš minēto dokumentu."
+        ),
+        default_prompt_label_mapping=dict(),
+    ),
     IS: PromptConfig(
         default_prompt_prefix="Eftirfarandi eru skjöl með meðfylgjandi samantektum.",
         default_prompt_template="Skjal: {text}\nSamantekt: {target_text}",
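Note: each `PromptConfig` entry above is a set of plain Python format strings; the few-shot template receives both the document and its reference summary, while the instruction prompt only receives the document. A minimal sketch of how the new Estonian entry could be rendered (the helper functions are illustrative, not part of EuroEval's API):

# Illustrative rendering of the Estonian summarization prompts from the diff above.
ET_PROMPT_TEMPLATE = "Dokument: {text}\nKokkuvõte: {target_text}"
ET_INSTRUCTION_PROMPT = "Dokument: {text}\n\nKoosta ülaltoodud dokumendi kokkuvõte."


def render_few_shot_example(document: str, summary: str) -> str:
    """Fill the few-shot template with a document and its reference summary."""
    return ET_PROMPT_TEMPLATE.format(text=document, target_text=summary)


def render_instruction(document: str) -> str:
    """Fill the instruction prompt with the document to be summarised."""
    return ET_INSTRUCTION_PROMPT.format(text=document)


print(render_few_shot_example("Lühike uudisartikkel.", "Lühike kokkuvõte."))
print(render_instruction("Lühike uudisartikkel."))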
euroeval/scores.py
CHANGED
@@ -52,7 +52,12 @@ def log_scores(
         test_se, test_se_str = metric.postprocessing_fn(test_se)
         total_dict[f"test_{metric.name}"] = test_score
         total_dict[f"test_{metric.name}_se"] = test_se
-
+        log_str = (
+            f"{metric.pretty_name}: {test_score_str} ± {test_se_str}"
+            if not np.isnan(test_se)
+            else f"{metric.pretty_name}: {test_score_str}"
+        )
+        logger.info(log_str)

     return dict(raw=scores, total=total_dict)

@@ -84,7 +89,7 @@ def aggregate_scores(

     if len(test_scores) > 1:
         sample_std = np.std(test_scores, ddof=1)
-        test_se = sample_std / np.sqrt(len(test_scores))
+        test_se = (sample_std / np.sqrt(len(test_scores))).item()
     else:
         test_se = np.nan

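For reference, the aggregation in `aggregate_scores` is the standard error of the mean over the per-iteration scores, and the new `log_scores` logic simply drops the `±` part when that standard error is NaN (i.e. when only one iteration was run). A small self-contained sketch of the same arithmetic, with made-up scores and metric name:

import numpy as np

test_scores = [0.71, 0.69, 0.74]  # made-up per-iteration scores

if len(test_scores) > 1:
    # Standard error of the mean: sample standard deviation (ddof=1) over sqrt(n),
    # with .item() converting the NumPy scalar into a plain Python float.
    test_se = (np.std(test_scores, ddof=1) / np.sqrt(len(test_scores))).item()
else:
    test_se = np.nan

test_score = float(np.mean(test_scores))
pretty_name = "Accuracy"  # made-up metric name
log_str = (
    f"{pretty_name}: {test_score:.2%} ± {test_se:.2%}"
    if not np.isnan(test_se)
    else f"{pretty_name}: {test_score:.2%}"
)
print(log_str)  # e.g. "Accuracy: 71.33% ± 1.45%"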
euroeval/speed_benchmark.py
CHANGED
@@ -59,7 +59,7 @@ def benchmark_speed_single_iteration(
     Returns:
         A dictionary containing the scores for the current iteration.
     """
-
+    gpt2_tokeniser = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)

     base_doc = "Document which contains roughly 10 tokens. "
     multiplier = 10 * (1 + itr_idx)
@@ -74,11 +74,11 @@
         model.generate(inputs=dict(text=[doc]))

     def encoder_predict(doc: str) -> None:
-
+        tokeniser = model.get_tokeniser()
         pytorch_model = model.get_pytorch_module()
         inputs = {
             key: tensor.to(pytorch_model.device)
-            for key, tensor in
+            for key, tensor in tokeniser(
                 text=[doc], truncation=True, return_tensors="pt"
             ).items()
         }
@@ -102,21 +102,21 @@
         speed_scores = pyinfer.InferenceReport(
             model=predict, inputs=doc, n_seconds=3
         ).run(print_report=False)
-        num_gpt2_tokens = len(
+        num_gpt2_tokens = len(gpt2_tokeniser([doc], truncation=True)["input_ids"][0])
        gpt2_tokens_per_second = speed_scores["Infer(p/sec)"] * num_gpt2_tokens

         speed_scores_short = pyinfer.InferenceReport(
             model=predict, inputs=short_doc, n_seconds=3
         ).run(print_report=False)
         num_gpt2_tokens_short = len(
-
+            gpt2_tokeniser([short_doc], truncation=True)["input_ids"][0]
         )
         gpt2_tokens_per_second_short = (
             speed_scores_short["Infer(p/sec)"] * num_gpt2_tokens_short
         )

     except (RuntimeError, ValueError, IndexError) as e:
-        raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}")
+        raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}") from e

     return dict(
         test_speed=gpt2_tokens_per_second, test_speed_short=gpt2_tokens_per_second_short
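The reported unit here is GPT-2 tokens per second: `pyinfer` measures inferences per second over a three-second window, and the score is that rate multiplied by the GPT-2 token count of the benchmark document. A minimal sketch of the scaling step (the inference rate is a stand-in for `pyinfer`'s `Infer(p/sec)` field):

from transformers import AutoTokenizer

# Count GPT-2 tokens in the benchmark document, as in the diff above.
gpt2_tokeniser = AutoTokenizer.from_pretrained("gpt2")
doc = "Document which contains roughly 10 tokens. " * 20
num_gpt2_tokens = len(gpt2_tokeniser([doc], truncation=True)["input_ids"][0])

# Stand-in value for speed_scores["Infer(p/sec)"] reported by pyinfer.
inferences_per_second = 3.7

# The benchmark's speed score: GPT-2 tokens processed per second.
gpt2_tokens_per_second = inferences_per_second * num_gpt2_tokens
print(f"{gpt2_tokens_per_second:.1f} GPT-2 tokens/second")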
euroeval/task_group_utils/multiple_choice_classification.py
CHANGED

@@ -94,15 +94,15 @@ class MultipleChoiceClassificationTrainer(Trainer):


 def prepare_examples(
-    examples: "BatchEncoding",
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare the features.

     Args:
         examples:
             The examples to prepare.
-
-        The
+        tokeniser:
+            The tokeniser to use to prepare the examples.

     Returns:
         The prepared examples.
@@ -110,12 +110,23 @@ def prepare_examples(
     doc: str = examples["text"][0]
     sections = doc.split("\n")

-
+    candidate_choice_idxs = [
         idx
         for idx, section in enumerate(sections)
-        if re.match(pattern=r"^[a-
+        if re.match(pattern=r"^[a-z0-9]+\. ", string=section) is not None
     ]
-
+
+    # Sometimes the question itself starts with a letter or number followed by a dot, We
+    # want to ignore these cases, and focus on the final contingent block of at least
+    # two choices.
+    choice_idxs: list[int] = list()
+    for idx in reversed(candidate_choice_idxs):
+        if len(choice_idxs) < 2 or (
+            len(choice_idxs) >= 2 and idx == choice_idxs[-1] - 1
+        ):
+            choice_idxs.append(idx)
+
+    choices = [sections[idx] for idx in reversed(choice_idxs)]

     # Check that the choices are present, and that all of them are at the end
     assert len(choices) > 0, "No choices found in the document."
@@ -127,7 +138,7 @@ def prepare_examples(
     question_idx = min(choice_idxs) - 2  # -2 to remove the 'Choices:' line
     context_and_question = "\n".join(sections[: question_idx + 1]).strip()

-    new_examples =
+    new_examples = tokeniser(
         text=[context_and_question] * len(choices),
         text_pair=[choice[3:] for choice in choices],
         padding=True,
@@ -135,7 +146,7 @@ def prepare_examples(
     )
     new_examples["label"] = [
         int(choice.startswith(f"{letter}. ") and letter == examples["label"][0])
-        for letter, choice in zip("
+        for letter, choice in zip("abcdefghijklmnopqrstuvwxyz", choices)
     ]
     new_examples["id"] = [hashlib.md5(string=doc.encode()).hexdigest()] * len(choices)
     return new_examples
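The new selection logic first collects every line that looks like a choice marker (`a. `, `b. `, `1. ` and so on) and then keeps only the final contiguous run of at least two such lines, so a question that itself starts with something like `1. ` is not mistaken for a choice. A standalone sketch of that step on a made-up document:

import re

doc = (
    "Some context paragraph.\n"
    "1. What is the capital of Denmark?\n"
    "Choices:\n"
    "a. Copenhagen\n"
    "b. Aarhus\n"
    "c. Odense"
)
sections = doc.split("\n")

# Every line that starts like a choice marker, e.g. "a. " or "1. ".
candidate_choice_idxs = [
    idx
    for idx, section in enumerate(sections)
    if re.match(pattern=r"^[a-z0-9]+\. ", string=section) is not None
]

# Walk backwards and keep only the final contiguous block of at least two choices,
# which drops the question line ("1. What is ...") at index 1.
choice_idxs: list[int] = []
for idx in reversed(candidate_choice_idxs):
    if len(choice_idxs) < 2 or idx == choice_idxs[-1] - 1:
        choice_idxs.append(idx)

choices = [sections[idx] for idx in reversed(choice_idxs)]
print(choices)  # ['a. Copenhagen', 'b. Aarhus', 'c. Odense']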
euroeval/task_group_utils/question_answering.py
CHANGED

@@ -23,7 +23,7 @@ if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
     from transformers.training_args import TrainingArguments

-    from ..data_models import DatasetConfig, GenerativeModelOutput
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions

 logger = logging.getLogger("euroeval")
@@ -57,7 +57,7 @@ class QuestionAnsweringTrainer(Trainer):
             **kwargs,
         )

-        # Get the CLS token id for the
+        # Get the CLS token id for the tokeniser
         if self.tokenizer is not None:
             assert isinstance(self.tokenizer, PreTrainedTokenizerBase)
             special_token_metadata = get_special_token_metadata(self.tokenizer)
@@ -149,6 +149,7 @@ class QuestionAnsweringTrainer(Trainer):
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     dataset: "Dataset",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
@@ -159,6 +160,8 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
+        benchmark_config:
+            The configuration of the benchmark.
         dataset:
             The dataset used for evaluation. This is only used in case any additional
             metadata is used to compute the metrics.
@@ -186,7 +189,11 @@ def compute_metrics(
     results: dict[str, float] = dict()
     for metric in dataset_config.task.metrics:
         score: float | None = metric(
-            predictions=predictions,
+            predictions=predictions,
+            references=labels,
+            dataset=dataset,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
         )

         # The metric returns None if we are running on multi-GPU and the current
@@ -221,15 +228,15 @@ def extract_labels_from_generation(


 def prepare_train_examples(
-    examples: "BatchEncoding",
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare the features for training.

     Args:
         examples:
             The examples to prepare.
-
-        The
+        tokeniser:
+            The tokeniser to use to prepare the examples.

     Returns:
         The prepared examples.
@@ -239,15 +246,15 @@ def prepare_train_examples(
     # take a lots of space). So we remove that left whitespace
     examples["question"] = [q.lstrip() for q in examples["question"]]

-    # Extract special token metadata from the
-    special_token_metadata = get_special_token_metadata(
+    # Extract special token metadata from the tokeniser
+    special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
     has_cls_token = special_token_metadata["has_cls_token"]
     has_sep_token = special_token_metadata["has_sep_token"]
     cls_token_id = special_token_metadata["cls_token_id"]
     cls_token = special_token_metadata["cls_token"]
     sep_token = special_token_metadata["sep_token"]

-    # If the
+    # If the tokeniser is not adding special tokens, then we add them manually
     if not has_cls_token and not has_sep_token:
         examples["question"] = [
             f"{cls_token}{q}{sep_token}" for q in examples["question"]
@@ -258,18 +265,18 @@ def prepare_train_examples(
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
-    max_question_tokens = max(len(
+    max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
-    stride =
-    max_length =
+    stride = tokeniser.model_max_length // 4
+    max_length = tokeniser.model_max_length - stride
     stride = min(stride, max_length - max_question_tokens - num_special_tokens)
-    max_length =
+    max_length = tokeniser.model_max_length - stride

     # Tokenize our examples with truncation and padding, but keep the overflows using a
     # stride. This results in one example possible giving several features when a
     # context is long, each of those features having a context that overlaps a bit the
     # context of the previous feature.
-    tokenized_examples =
+    tokenized_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
         truncation="only_second",
@@ -306,9 +313,9 @@ def prepare_train_examples(
         sequence_ids = tokenized_examples.sequence_ids(i)

         # Manually ensure that the special tokens are set to None in `sequence_ids`
-        for special_token in
-        if hasattr(
-        special_token_id = getattr(
+        for special_token in tokeniser.special_tokens_map.keys():
+            if hasattr(tokeniser, f"{special_token}_id"):
+                special_token_id = getattr(tokeniser, f"{special_token}_id")
         if special_token_id is not None:
             sequence_ids = [
                 None if token_id == special_token_id else seq_id
@@ -373,15 +380,15 @@ def prepare_train_examples(


 def prepare_test_examples(
-    examples: "BatchEncoding",
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare test examples.

     Args:
         examples:
             Dictionary of test examples.
-
-        The
+        tokeniser:
+            The tokeniser used to preprocess the examples.

     Returns:
         The prepared test examples.
@@ -391,14 +398,14 @@ def prepare_test_examples(
     # take a lots of space). So we remove that left whitespace
     examples["question"] = [q.lstrip() for q in examples["question"]]

-    # Extract special token metadata from the
-    special_token_metadata = get_special_token_metadata(
+    # Extract special token metadata from the tokeniser
+    special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
     has_cls_token = special_token_metadata["has_cls_token"]
     has_sep_token = special_token_metadata["has_sep_token"]
     cls_token = special_token_metadata["cls_token"]
     sep_token = special_token_metadata["sep_token"]

-    # If the
+    # If the tokeniser is not adding special tokens, then we add them manually
     if not has_cls_token and not has_sep_token:
         examples["question"] = [
             f"{cls_token}{q}{sep_token}" for q in examples["question"]
@@ -409,18 +416,18 @@ def prepare_test_examples(
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
-    max_question_tokens = max(len(
+    max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
-    stride =
-    max_length =
+    stride = tokeniser.model_max_length // 4
+    max_length = tokeniser.model_max_length - stride
     stride = min(stride, max_length - max_question_tokens - num_special_tokens)
-    max_length =
+    max_length = tokeniser.model_max_length - stride

     # Tokenize our examples with truncation and maybe padding, but keep the overflows
     # using a stride. This results in one example possible giving several features when
     # a context is long, each of those features having a context that overlaps a bit
     # the context of the previous feature.
-    tokenized_examples =
+    tokenized_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
         truncation="only_second",
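The stride computation above reserves a quarter of the tokeniser's context window as overlap between consecutive features, caps it so the question and special tokens always fit, and then recomputes the effective maximum length. The same arithmetic with made-up numbers:

# Illustrative numbers only: a typical encoder context of 512 tokens and a
# question of 30 tokens with [CLS] and [SEP] special tokens.
model_max_length = 512
max_question_tokens = 30
num_special_tokens = 2  # [CLS] + [SEP]

# A quarter of the context window is used as overlap (stride) between features.
stride = model_max_length // 4                      # 128
max_length = model_max_length - stride              # 384

# Make sure the stride never exceeds the room left once the question and the
# special tokens are accounted for, then recompute the effective max length.
stride = min(stride, max_length - max_question_tokens - num_special_tokens)  # 128
max_length = model_max_length - stride              # 384

print(stride, max_length)  # 128 384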
euroeval/task_group_utils/sequence_classification.py
CHANGED

@@ -7,14 +7,19 @@ import typing as t
 import Levenshtein
 import numpy as np

+from ..enums import TaskGroup
 from ..exceptions import InvalidBenchmark
-from ..utils import
+from ..utils import (
+    extract_multiple_choice_labels,
+    log_once,
+    raise_if_model_output_contains_nan_values,
+)

 if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
     from transformers.trainer_utils import EvalPrediction

-    from ..data_models import DatasetConfig, GenerativeModelOutput
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions


@@ -24,6 +29,7 @@ logger = logging.getLogger("euroeval")
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     dataset: "Dataset",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
@@ -34,6 +40,8 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
+        benchmark_config:
+            The configuration of the benchmark.
         dataset:
             The dataset used for evaluation. This is only used in case any additional
             metadata is used to compute the metrics.
@@ -79,7 +87,11 @@ def compute_metrics(
     results: dict[str, float] = dict()
     for metric in dataset_config.task.metrics:
         score: float | None = metric(
-            predictions=predictions,
+            predictions=predictions,
+            references=label_ids,
+            dataset=dataset,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
         )

         # The metric returns None if we are running on multi-GPU and the current
@@ -113,7 +125,28 @@ def extract_labels_from_generation(

     Returns:
         The predicted labels.
+
+    Raises:
+        InvalidBenchmark:
+            If the task requires log probabilities, but the model did not output them,
+            or if the model outputted log probabilities but the first label token
+            mapping is not provided.
     """
+    # Get the candidate labels, which are the labels that the model can predict
+    default_labels = [
+        dataset_config.prompt_label_mapping[lbl]
+        for lbl in dataset_config.id2label.values()
+    ]
+    if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+        sample_candidate_labels = [
+            extract_multiple_choice_labels(
+                prompt=prompt, candidate_labels=default_labels
+            )
+            for prompt in input_batch["prompt"]
+        ]
+    else:
+        sample_candidate_labels = [default_labels] * len(input_batch["prompt"])
+
     if model_output.scores is not None:
         if first_label_token_mapping is False:
             raise InvalidBenchmark(
@@ -122,38 +155,85 @@ def extract_labels_from_generation(
             )
         labels = get_closest_logprobs_labels(
             generation_logprobs=model_output.scores,
-            dataset_config=dataset_config,
             first_label_token_mapping=first_label_token_mapping,
+            candidate_labels=sample_candidate_labels,
         )
         if labels is not None:
             return labels
+        elif dataset_config.task.requires_logprobs:
+            raise InvalidBenchmark(
+                "This task requires the model to output logprobs, and this model "
+                "does not seem to be able to do that. Skipping the evaluation."
+            )

-    candidate_labels = [
-        dataset_config.prompt_label_mapping[lbl]
-        for lbl in dataset_config.id2label.values()
-    ]
     new_predicted_labels: list[str] = list()
-    for predicted_label in model_output.sequences:
+    for idx, predicted_label in enumerate(model_output.sequences):
         # If the prediction includes a boxed answer, use that instead of the full
         # generation
         if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
             predicted_label = m.group(1)

-        #
+        # We set the word edit distance weights such that we heavily penalise insertions
+        # and substitutions, so that we don't just insert the correct label, but that we
+        # want the model to have included the correct label in its output.
+        insertion_weight = 1000
+        deletion_weight = 1
+        substitution_weight = 1000
+
+        # Compute the word edit distances between the predicted label and all candidate
+        # labels
         edit_distances = [
-            Levenshtein.distance(
-
+            Levenshtein.distance(
+                s1=predicted_label.lower(),
+                s2=candidate_label.lower(),
+                weights=(insertion_weight, deletion_weight, substitution_weight),
+            )
+            for candidate_label in sample_candidate_labels[idx]
         ]
-
-
+
+        best_candidate_label = sample_candidate_labels[idx][
+            np.argmin(edit_distances).item()
+        ]
+
+        # If no candidate labels were found, we either pick the label with the smallest
+        # word edit distance to the predicted label (if invalid model outputs are
+        # allowed), or we raise an error
+        if min(edit_distances) > 100:
+            if dataset_config.task.allow_invalid_model_outputs:
+                logger.warning(
+                    "No candidate labels found for the predicted label "
+                    f"{predicted_label!r}, out of the candidate labels "
+                    f"{sample_candidate_labels[idx]}. This likely means that the model "
+                    "output is completely off, but since invalid model outputs are "
+                    "allowed for this task, we will use the closest candidate label "
+                    f"({best_candidate_label})) as the output label. If you see this "
+                    "warning very often, please report this issue to the EuroEval "
+                    "team at github.com/EuroEval/EuroEval/issues."
+                )
+                logger.debug(
+                    "The candidate labels were extracted from the prompt: "
+                    f"{input_batch['text'][idx]!r}."
+                )
+            else:
+                raise InvalidBenchmark(
+                    "No candidate labels found for the predicted label "
+                    f"{predicted_label!r}, out of the candidate labels "
+                    f"{sample_candidate_labels[idx]}. This likely means that the model "
+                    "output is completely off, and we cannot extract any labels from "
+                    "it. Please check the model output and the candidate labels. The "
+                    "candidate labels were extracted from the prompt: "
+                    f"{input_batch['text'][idx]!r}."
+                )
+
+        new_predicted_labels.append(best_candidate_label)

     return new_predicted_labels


 def get_closest_logprobs_labels(
     generation_logprobs: list[list[list[tuple[str, float]]]],
-    dataset_config: "DatasetConfig",
     first_label_token_mapping: dict[str, str] | t.Literal[True],
+    candidate_labels: list[list[str]],
 ) -> list[str] | None:
     """Get the labels with the highest predicted logprob value.

@@ -166,11 +246,11 @@ get_closest_logprobs_labels(
         generation_logprobs:
             The logprobs of the generated tokens, for all samples in the batch. Of shape
             (batch_size, num_tokens, num_logprobs).
-        dataset_config:
-            The configuration of the dataset.
         first_label_token_mapping:
             A mapping from labels to the first token in each label, or alternatively a
             `True` value indicating that the model should output logprobs.
+        candidate_labels:
+            The candidate labels for each sample in the batch.

     Returns:
         The predicted labels, or None if labels could not be extracted.
@@ -179,19 +259,11 @@
         InvalidBenchmark:
             If no candidate label can be found for any of the generated labels.
     """
-    english_labels = list(dataset_config.id2label.values())
-    english2local = dataset_config.prompt_label_mapping
-    candidate_labels = [english2local[lbl].lower() for lbl in english_labels]
-
     output_labels: list[str] = list()
-    for sample in generation_logprobs:
+    for idx, sample in enumerate(generation_logprobs):
         for logprob_list in sample:
             generated_labels = [
-                re.sub(
-                    pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
-                    repl="",
-                    string=label.lower(),
-                )
+                re.sub(pattern=r"^[^a-zæøåüöä0-9]+$", repl="", string=label.lower())
                 for label, _ in logprob_list
             ]
             generated_labels = [label for label in generated_labels if label != ""]
@@ -206,7 +278,7 @@
             if isinstance(first_label_token_mapping, dict):
                 if any(
                     candidate_label not in first_label_token_mapping
-                    for candidate_label in candidate_labels
+                    for candidate_label in candidate_labels[idx]
                 ):
                     raise InvalidBenchmark(
                         "There is a label not present in the first label token "
@@ -217,16 +289,28 @@

                 candidate_output_labels = {
                     candidate_label
-                    for candidate_label in candidate_labels
+                    for candidate_label in candidate_labels[idx]
                     if generated_label == first_label_token_mapping[candidate_label]
                 }
             else:
                 candidate_output_labels = {
                     candidate_label
-                    for candidate_label in candidate_labels
+                    for candidate_label in candidate_labels[idx]
                     if candidate_label.startswith(generated_label)
                 }

+            # If the generated label is a numeral (e.g., "1", "2", "3") and there is
+            # a matching candidate label, we only keep the full match
+            if re.match(r"^\d+$", generated_label) and any(
+                candidate_label == generated_label
+                for candidate_label in candidate_output_labels
+            ):
+                candidate_output_labels = {
+                    candidate_label
+                    for candidate_label in candidate_output_labels
+                    if candidate_label == generated_label
+                }
+
             # If we can uniquely determine the output label, we break the loop.
             if len(candidate_output_labels) == 1:
                 output_label = candidate_output_labels.pop()
@@ -257,16 +341,18 @@
             elif len(candidate_output_labels) == 0:
                 candidate_output_labels_starting_with_generated_label = [
                     candidate_label
-                    for candidate_label in candidate_labels
+                    for candidate_label in candidate_labels[idx]
                     if candidate_label.startswith(generated_label)
                 ]
                 if candidate_output_labels_starting_with_generated_label:
                     log_once(
                         f"No candidate label found for the generated label "
-                        f"{generated_label!r}
-                        "
-                        "
-                        "
+                        f"{generated_label!r}, but there are candidate labels "
+                        f"starting with it: "
+                        f"{candidate_output_labels_starting_with_generated_label}. "
+                        "This means that the first label token mapping is not "
+                        "reliable, and we will instead fall back to extracting "
+                        "the labels using word edit distance.",
                         level=logging.DEBUG,
                     )
                     return None
@@ -291,18 +377,18 @@
         if len(sample) == 0:
             log_once(
                 "The model outputted an empty string, so no candidate labels could "
-
-                "label.",
-                level=logging.
+                "be determined. Using the first label, "
+                f"{candidate_labels[idx][0]!r}, as the output label.",
+                level=logging.INFO,
             )
         else:
             log_once(
                 "Could not find a candidate label for any of the generated "
-                f"labels in the sample {sample}. Using
-                "as the output label.",
-                level=logging.
+                f"labels in the sample {sample}. Using the first label, "
+                f"{candidate_labels[idx][0]!r}, as the output label.",
+                level=logging.INFO,
             )
-        output_labels.append(candidate_labels[0])
+        output_labels.append(candidate_labels[idx][0])

     assert len(output_labels) == len(generation_logprobs)
     return output_labels