EuroEval 16.0.1__py3-none-any.whl → 16.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of EuroEval might be problematic.
- euroeval/benchmark_config_factory.py +6 -1
- euroeval/benchmark_modules/base.py +2 -0
- euroeval/benchmark_modules/fresh.py +7 -1
- euroeval/benchmark_modules/hf.py +26 -21
- euroeval/benchmark_modules/litellm.py +258 -131
- euroeval/benchmark_modules/vllm.py +79 -40
- euroeval/benchmarker.py +11 -2
- euroeval/cli.py +14 -1
- euroeval/constants.py +1 -1
- euroeval/data_models.py +77 -6
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +14 -0
- euroeval/dataset_configs/dutch.py +14 -0
- euroeval/dataset_configs/english.py +22 -0
- euroeval/dataset_configs/estonian.py +15 -7
- euroeval/dataset_configs/finnish.py +14 -0
- euroeval/dataset_configs/french.py +14 -0
- euroeval/dataset_configs/german.py +23 -0
- euroeval/dataset_configs/italian.py +14 -0
- euroeval/dataset_configs/latvian.py +14 -0
- euroeval/dataset_configs/norwegian.py +14 -0
- euroeval/dataset_configs/polish.py +126 -0
- euroeval/dataset_configs/portuguese.py +14 -0
- euroeval/dataset_configs/spanish.py +14 -0
- euroeval/dataset_configs/swedish.py +25 -0
- euroeval/enums.py +12 -0
- euroeval/generation.py +17 -8
- euroeval/generation_utils.py +65 -11
- euroeval/metrics/pipeline.py +1 -1
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +27 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +11 -0
- euroeval/prompt_templates/sentiment_classification.py +15 -0
- euroeval/prompt_templates/summarization.py +27 -1
- euroeval/scores.py +5 -0
- euroeval/task_group_utils/question_answering.py +29 -29
- euroeval/task_group_utils/sequence_classification.py +11 -34
- euroeval/task_group_utils/token_classification.py +3 -3
- euroeval/tasks.py +4 -4
- euroeval/{tokenization_utils.py → tokenisation_utils.py} +50 -28
- euroeval/utils.py +36 -3
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/METADATA +1 -1
- euroeval-16.1.1.dist-info/RECORD +70 -0
- euroeval-16.0.1.dist-info/RECORD +0 -69
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/WHEEL +0 -0
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/entry_points.txt +0 -0
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/licenses/LICENSE +0 -0
euroeval/prompt_templates/linguistic_acceptability.py
CHANGED

@@ -19,6 +19,7 @@ from ..languages import (
     NL,
     NN,
     NO,
+    PL,
     PT,
     SV,
 )
@@ -67,6 +68,14 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
         default_instruction_prompt="Lause: {text}\n\nOtsusta, kas lause on "
         "grammatiliselt õige või mitte. Vasta {labels_str}, ja mitte midagi muud.",
     ),
+    PL: PromptConfig(
+        default_prompt_label_mapping=dict(correct="tak", incorrect="nie"),
+        default_prompt_prefix="Poniżej znajdują się teksty i czy są "
+        "gramatycznie poprawne.",
+        default_prompt_template="Tekst: {text}\nGramatycznie poprawny: {label}",
+        default_instruction_prompt="Tekst: {text}\n\nOkreśl czy tekst jest "
+        "gramatycznie poprawny czy nie. Odpowiedz {labels_str}, i nic więcej.",
+    ),
     PT: PromptConfig(
         default_prompt_label_mapping=dict(correct="sim", incorrect="não"),
         default_prompt_prefix="Seguem-se abaixo textos e se são "
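The new Polish entry follows the same shape as the existing languages: a prefix, a per-example template, an instruction prompt and a label mapping. Below is a minimal sketch, not EuroEval's actual rendering code, of how these fields could be combined into a few-shot prompt; the example sentences and the blank-line joining are illustrative assumptions.

# Minimal sketch of rendering the new Polish linguistic-acceptability template.
# The example sentences and the blank-line joining are assumptions, not EuroEval code.
prefix = "Poniżej znajdują się teksty i czy są gramatycznie poprawne."
template = "Tekst: {text}\nGramatycznie poprawny: {label}"
label_mapping = dict(correct="tak", incorrect="nie")

few_shot_examples = [
    ("Ala ma kota.", "correct"),           # hypothetical few-shot examples
    ("Kota ma Ala nie są.", "incorrect"),
]

sections = [prefix] + [
    template.format(text=text, label=label_mapping[label])
    for text, label in few_shot_examples
]
print("\n\n".join(sections))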
euroeval/prompt_templates/multiple_choice.py
CHANGED

@@ -3,7 +3,25 @@
 import typing as t
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, ET, FI, FR, IS, IT, LV, NB, NL, NN, NO, PT, SV
+from ..languages import (
+    DA,
+    DE,
+    EN,
+    ES,
+    ET,
+    FI,
+    FR,
+    IS,
+    IT,
+    LV,
+    NB,
+    NL,
+    NN,
+    NO,
+    PL,
+    PT,
+    SV,
+)
 
 if t.TYPE_CHECKING:
     from ..data_models import Language
@@ -123,6 +141,14 @@ MULTIPLE_CHOICE_TEMPLATES: dict["Language", PromptConfig] = {
         "{labels_str}, og ikke noe annet.",
         default_prompt_label_mapping="auto",
     ),
+    PL: PromptConfig(
+        default_prompt_prefix="Poniżej znajdują się pytania wielokrotnego wyboru "
+        "(z odpowiedziami).",
+        default_prompt_template="Pytanie: {text}\nOdpowiedź: {label}",
+        default_instruction_prompt="Pytanie: {text}\n\nOdpowiedz na powyższe pytanie, "
+        "odpowiadając {labels_str}, i nic więcej.",
+        default_prompt_label_mapping="auto",
+    ),
     SV: PromptConfig(
         default_prompt_prefix="Följande är flervalsfrågor (med svar).",
         default_prompt_template="Fråga: {text}\nSvar: {label}",
euroeval/prompt_templates/named_entity_recognition.py
CHANGED

@@ -19,6 +19,7 @@ from ..languages import (
     NL,
     NN,
     NO,
+    PL,
     PT,
     SV,
 )
@@ -336,6 +337,25 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
         "Verdiene skal være lister over de navngitte enhetene "
         "av den typen, akkurat som de vises i frasen.",
     ),
+    PL: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "osoba",
+            "i-per": "osoba",
+            "b-loc": "lokalizacja",
+            "i-loc": "lokalizacja",
+            "b-org": "organizacja",
+            "i-org": "organizacja",
+            "b-misc": "różne",
+            "i-misc": "różne",
+        },
+        default_prompt_prefix="Poniżej znajdują się zdania i słowniki JSON z nazwanymi "
+        "jednostkami występującymi w danym zdaniu.",
+        default_prompt_template="Zdanie: {text}\nNazwane jednostki: {label}",
+        default_instruction_prompt="Zdanie: {text}\n\nZidentyfikuj nazwane jednostki "
+        "w zdaniu. Powinieneś wypisać to jako słownik JSON z kluczami "
+        "{labels_str}. Wartości powinny być listami nazwanych jednostek "
+        "tego typu, dokładnie tak jak pojawiają się w zdaniu.",
+    ),
     SV: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "person",
euroeval/prompt_templates/reading_comprehension.py
CHANGED

@@ -19,6 +19,7 @@ from ..languages import (
     NL,
     NN,
     NO,
+    PL,
     PT,
     SV,
 )
@@ -157,6 +158,16 @@ RC_TEMPLATES: dict["Language", PromptConfig] = {
         "teksten ovenfor med maks 3 ord.\n\nSpørsmål: {question}",
         default_prompt_label_mapping=dict(),
     ),
+    PL: PromptConfig(
+        default_prompt_prefix=(
+            "Poniżej znajdują się teksty z towarzyszącymi pytaniami i odpowiedziami."
+        ),
+        default_prompt_template="Tekst: {text}\nPytanie: {question}\nOdpowiedź w "
+        "maksymalnie 3 słowach: {label}",
+        default_instruction_prompt="Tekst: {text}\n\nOdpowiedz na następujące pytanie "
+        "dotyczące powyższego tekstu w maksymalnie 3 słowach.\n\nPytanie: {question}",
+        default_prompt_label_mapping=dict(),
+    ),
     PT: PromptConfig(
         default_prompt_prefix="Os textos que se seguem são acompanhados de perguntas "
         "e respostas.",
euroeval/prompt_templates/sentiment_classification.py
CHANGED

@@ -19,6 +19,7 @@ from ..languages import (
     NL,
     NN,
     NO,
+    PL,
     PT,
     SV,
 )
@@ -78,6 +79,20 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
         "meelestatuse järgi. Võimalikud vastused: {labels_str}. Muud vastused "
         "ei ole lubatud.",
     ),
+    PL: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="pozytywny", neutral="neutralny", negative="negatywny"
+        ),
+        default_prompt_prefix=(
+            "Poniżej znajdują się dokumenty i ich sentyment, który może być "
+            "{labels_str}."
+        ),
+        default_prompt_template="Dokument: {text}\nSentyment: {label}",
+        default_instruction_prompt=(
+            "Dokument: {text}\n\nKlasyfikuj sentyment w dokumencie. "
+            "Odpowiedz z {labels_str}, i nic więcej."
+        ),
+    ),
     PT: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="positivo", neutral="neutro", negative="negativo"
euroeval/prompt_templates/summarization.py
CHANGED

@@ -3,7 +3,25 @@
 import typing as t
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, ET, FI, FR, IS, IT, LV, NB, NL, NN, NO, PT, SV
+from ..languages import (
+    DA,
+    DE,
+    EN,
+    ES,
+    ET,
+    FI,
+    FR,
+    IS,
+    IT,
+    LV,
+    NB,
+    NL,
+    NN,
+    NO,
+    PL,
+    PT,
+    SV,
+)
 
 if t.TYPE_CHECKING:
     from ..data_models import Language
@@ -122,6 +140,14 @@ SUMM_TEMPLATES: dict["Language", PromptConfig] = {
         "dokumentet ovenfor.",
         default_prompt_label_mapping=dict(),
     ),
+    PL: PromptConfig(
+        default_prompt_prefix="Poniżej znajdują się artykuły z towarzyszącymi "
+        "streszczeniami.",
+        default_prompt_template="Artykuł: {text}\nStreszczenie: {target_text}",
+        default_instruction_prompt="Artykuł: {text}\n\nNapisz streszczenie "
+        "powyższego artykułu.",
+        default_prompt_label_mapping=dict(),
+    ),
     SV: PromptConfig(
         default_prompt_prefix="Nedan följer dokument med tillhörande sammanfattningar.",
         default_prompt_template="Dokument: {text}\nSammanfattning: {target_text}",
euroeval/scores.py
CHANGED

@@ -19,6 +19,7 @@ def log_scores(
     scores: list[dict[str, float]],
     model_id: str,
     model_revision: str,
+    model_param: str | None,
 ) -> "ScoreDict":
     """Log the scores.
 
@@ -34,6 +35,8 @@ def log_scores(
             The model ID of the model that was evaluated.
         model_revision:
             The revision of the model.
+        model_param:
+            The model parameter, if any.
 
     Returns:
         A dictionary with keys 'raw_scores' and 'total', with 'raw_scores' being
@@ -42,6 +45,8 @@ def log_scores(
     """
     if model_revision and model_revision != "main":
        model_id += f"@{model_revision}"
+    if model_param is not None:
+        model_id += f"#{model_param}"
 
     logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")
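The practical effect of the new `model_param` argument is on the reported model ID. Below is a standalone sketch mirroring the formatting shown in the hunk above (the helper name and example values are hypothetical, not part of EuroEval):

def format_model_id(model_id: str, model_revision: str, model_param: str | None) -> str:
    # Non-default revisions are appended with '@'; as of this release an optional
    # model parameter is appended with '#'.
    if model_revision and model_revision != "main":
        model_id += f"@{model_revision}"
    if model_param is not None:
        model_id += f"#{model_param}"
    return model_id

print(format_model_id("my-org/my-model", "main", None))       # my-org/my-model
print(format_model_id("my-org/my-model", "v2", "thinking"))   # my-org/my-model@v2#thinking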
euroeval/task_group_utils/question_answering.py
CHANGED

@@ -10,7 +10,7 @@ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from transformers.trainer import Trainer
 
 from ..exceptions import InvalidBenchmark
-from ..tokenization_utils import get_special_token_metadata
+from ..tokenisation_utils import get_special_token_metadata
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
@@ -261,7 +261,7 @@ def prepare_train_examples(
     ]
     examples["context"] = [f"{c}{sep_token}" for c in examples["context"]]
 
-    # Set the stride used during tokenization, when the context is long enough to be
+    # Set the stride used during tokenisation, when the context is long enough to be
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
@@ -272,11 +272,11 @@ def prepare_train_examples(
     stride = min(stride, max_length - max_question_tokens - num_special_tokens)
     max_length = tokeniser.model_max_length - stride
 
-    # Tokenize our examples with truncation and padding, but keep the overflows using a
+    # Tokenise our examples with truncation and padding, but keep the overflows using a
     # stride. This results in one example possible giving several features when a
     # context is long, each of those features having a context that overlaps a bit the
     # context of the previous feature.
-    tokenized_examples = tokeniser(
+    tokenised_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
         truncation="only_second",
@@ -290,27 +290,27 @@ def prepare_train_examples(
     # Since one example might give us several features if it has a long context, we
     # need a map from a feature to its corresponding example. This key gives us just
     # that
-    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+    sample_mapping = tokenised_examples.pop("overflow_to_sample_mapping")
 
     # The offset mappings will give us a map from token to character position in the
     # original context. This will help us compute the start_positions and
     # end_positions.
-    offset_mapping = tokenized_examples.pop("offset_mapping")
+    offset_mapping = tokenised_examples.pop("offset_mapping")
 
     # Initialise the start- and end positions of the answers
-    tokenized_examples["start_positions"] = list()
-    tokenized_examples["end_positions"] = list()
+    tokenised_examples["start_positions"] = list()
+    tokenised_examples["end_positions"] = list()
 
     for i, offsets in enumerate(offset_mapping):
         # Get the input IDs for the current example
-        input_ids = tokenized_examples.input_ids[i]
+        input_ids = tokenised_examples.input_ids[i]
 
         # We will label impossible answers with the index of the CLS token
         cls_index = input_ids.index(cls_token_id)
 
         # Grab the sequence corresponding to that example (to know what is the context
         # and what is the question).
-        sequence_ids = tokenized_examples.sequence_ids(i)
+        sequence_ids = tokenised_examples.sequence_ids(i)
 
         # Manually ensure that the special tokens are set to None in `sequence_ids`
         for special_token in tokeniser.special_tokens_map.keys():
@@ -329,8 +329,8 @@ def prepare_train_examples(
 
         # If no answers are given, set the cls_index as answer.
         if len(answers["answer_start"]) == 0:
-            tokenized_examples.start_positions.append(cls_index)
-            tokenized_examples.end_positions.append(cls_index)
+            tokenised_examples.start_positions.append(cls_index)
+            tokenised_examples.end_positions.append(cls_index)
 
         else:
             # Start/end character index of the answer in the text.
@@ -353,8 +353,8 @@ def prepare_train_examples(
                 offsets[token_start_index][0] <= start_char
                 and offsets[token_end_index][1] >= end_char
             ):
-                tokenized_examples.start_positions.append(cls_index)
-                tokenized_examples.end_positions.append(cls_index)
+                tokenised_examples.start_positions.append(cls_index)
+                tokenised_examples.end_positions.append(cls_index)
 
             # Otherwise move the token_start_index and token_end_index to the two ends
             # of the answer. Note: we could go after the last offset if the answer is
@@ -366,17 +366,17 @@ def prepare_train_examples(
                 ):
                     token_start_index += 1
                 token_start_index -= 1
-                tokenized_examples.start_positions.append(token_start_index)
+                tokenised_examples.start_positions.append(token_start_index)
                 while (
                     token_start_index <= token_end_index
                     and offsets[token_end_index][1] >= end_char
                 ):
                     token_end_index -= 1
                 token_end_index += 1
-                tokenized_examples.end_positions.append(token_end_index)
+                tokenised_examples.end_positions.append(token_end_index)
                 assert token_end_index >= token_start_index
 
-    return tokenized_examples
+    return tokenised_examples
 
 
 def prepare_test_examples(
@@ -394,7 +394,7 @@ def prepare_test_examples(
         The prepared test examples.
     """
     # Some of the questions have lots of whitespace on the left, which is not useful
-    # and will make the truncation of the context fail (the tokenized question will
+    # and will make the truncation of the context fail (the tokenised question will
     # take a lots of space). So we remove that left whitespace
     examples["question"] = [q.lstrip() for q in examples["question"]]
 
@@ -412,7 +412,7 @@ def prepare_test_examples(
     ]
     examples["context"] = [f"{c}{sep_token}" for c in examples["context"]]
 
-    # Set the stride used during tokenization, when the context is long enough to be
+    # Set the stride used during tokenisation, when the context is long enough to be
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
@@ -423,11 +423,11 @@ def prepare_test_examples(
     stride = min(stride, max_length - max_question_tokens - num_special_tokens)
     max_length = tokeniser.model_max_length - stride
 
-    # Tokenize our examples with truncation and maybe padding, but keep the overflows
+    # Tokenise our examples with truncation and maybe padding, but keep the overflows
     # using a stride. This results in one example possible giving several features when
     # a context is long, each of those features having a context that overlaps a bit
     # the context of the previous feature.
-    tokenized_examples = tokeniser(
+    tokenised_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
         truncation="only_second",
@@ -441,30 +441,30 @@ def prepare_test_examples(
     # Since one example might give us several features if it has a long context, we
     # need a map from a feature to its corresponding example. This key gives us just
     # that.
-    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+    sample_mapping = tokenised_examples.pop("overflow_to_sample_mapping")
 
     # We keep the id that gave us this feature and we will store the offset mappings.
-    tokenized_examples["id"] = list()
+    tokenised_examples["id"] = list()
 
-    for i in range(len(tokenized_examples.input_ids)):
+    for i in range(len(tokenised_examples.input_ids)):
         # Grab the sequence corresponding to that example (to know what is the context
         # and what is the question).
-        sequence_ids = tokenized_examples.sequence_ids(i)
+        sequence_ids = tokenised_examples.sequence_ids(i)
         context_index = 1
 
         # One example can give several spans, this is the index of the example
         # containing this span of text.
         sample_index = sample_mapping[i]
-        tokenized_examples.id.append(examples["id"][sample_index])
+        tokenised_examples.id.append(examples["id"][sample_index])
 
         # Set to (-1, -1) the offset_mapping that are not part of the context so it's
         # easy to determine if a token position is part of the context or not.
-        tokenized_examples.offset_mapping[i] = [
+        tokenised_examples.offset_mapping[i] = [
             (o if sequence_ids[k] == context_index else (-1, -1))
-            for k, o in enumerate(tokenized_examples.offset_mapping[i])
+            for k, o in enumerate(tokenised_examples.offset_mapping[i])
         ]
 
-    return tokenized_examples
+    return tokenised_examples
 
 
 def postprocess_predictions_and_labels(
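The renamed helpers above build on the Hugging Face tokeniser's overflow handling. Below is a self-contained sketch of that call with an illustrative model name and lengths (not EuroEval's defaults), showing where `overflow_to_sample_mapping` and `offset_mapping` come from:

from transformers import AutoTokenizer

# Model name, max_length and stride are illustrative assumptions.
tokeniser = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
encoded = tokeniser(
    text=["What colour is the sky?"],
    text_pair=["A very long context about the sky. " * 200],
    truncation="only_second",        # only the context is ever truncated
    max_length=384,
    stride=128,                      # overlap between consecutive context windows
    return_overflowing_tokens=True,  # a long context yields several features
    return_offsets_mapping=True,     # token -> character spans in the context
    padding="max_length",
)
# Maps every produced feature back to the example it came from.
print(encoded["overflow_to_sample_mapping"])
# One offset list per feature, used to compute start/end answer positions.
print(len(encoded["offset_mapping"]))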
euroeval/task_group_utils/sequence_classification.py
CHANGED

@@ -198,8 +198,8 @@ def extract_labels_from_generation(
     # If no candidate labels were found, we either pick the label with the smallest
     # word edit distance to the predicted label (if invalid model outputs are
     # allowed), or we raise an error
-    if min(edit_distances)
-        if dataset_config.
+    if min(edit_distances) >= 1000:
+        if dataset_config.allow_invalid_model_outputs:
             logger.warning(
                 "No candidate labels found for the predicted label "
                 f"{predicted_label!r}, out of the candidate labels "
@@ -296,19 +296,7 @@ def get_closest_logprobs_labels(
             candidate_output_labels = {
                 candidate_label
                 for candidate_label in candidate_labels[idx]
-                if candidate_label.startswith(generated_label)
-            }
-
-            # If the generated label is a numeral (e.g., "1", "2", "3") and there is
-            # a matching candidate label, we only keep the full match
-            if re.match(r"^\d+$", generated_label) and any(
-                candidate_label == generated_label
-                for candidate_label in candidate_output_labels
-            ):
-                candidate_output_labels = {
-                    candidate_label
-                    for candidate_label in candidate_output_labels
-                    if candidate_label == generated_label
+                if candidate_label.startswith(generated_label.strip())
             }
 
             # If we can uniquely determine the output label, we break the loop.
@@ -357,19 +345,6 @@ def get_closest_logprobs_labels(
                 )
                 return None
 
-            # If we did not find any candidate label for any of the generated labels, we
-            # assume that something is wrong with the model output, and we fall back to
-            # using word edit distance to extract the labels
-            else:
-                log_once(
-                    f"No candidate label found for any of the generated labels "
-                    f"{generated_labels}. This means that using logprobs to extract "
-                    "the labels is not reliable, and we will instead fall back to "
-                    "extracting the labels using word edit distance.",
-                    level=logging.DEBUG,
-                )
-                return None
-
             if output_label is not None:
                 output_labels.append(output_label)
                 break
@@ -377,18 +352,20 @@ def get_closest_logprobs_labels(
         if len(sample) == 0:
             log_once(
                 "The model outputted an empty string, so no candidate labels could "
-                "be determined.
-
+                "be determined. This means that using logprobs to extract the "
+                "labels is not reliable, and we will instead fall back to "
+                "extracting the labels using word edit distance.",
                 level=logging.INFO,
             )
         else:
             log_once(
-                "
-
-
+                "No candidate label found for any of the generated labels, which "
+                "means that using logprobs to extract the labels is not reliable, "
+                "and we will instead fall back to extracting the labels using "
+                "word edit distance.",
                 level=logging.INFO,
             )
-
+        return None
 
     assert len(output_labels) == len(generation_logprobs)
     return output_labels
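The simplification above drops the numeral special-casing in favour of plain prefix matching on the stripped generated token. A small illustrative sketch of that matching rule (the labels and generated strings are made up, and the helper is not part of EuroEval):

def match_candidates(generated_label: str, candidate_labels: list[str]) -> set[str]:
    # A candidate survives if it starts with the generated token, ignoring
    # surrounding whitespace; a unique survivor determines the output label.
    return {
        candidate_label
        for candidate_label in candidate_labels
        if candidate_label.startswith(generated_label.strip())
    }

print(match_candidates(" pos", ["positive", "negative", "neutral"]))  # {'positive'}
print(match_candidates("ne", ["positive", "negative", "neutral"]))    # ambiguous: two matches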
euroeval/task_group_utils/token_classification.py
CHANGED

@@ -273,7 +273,7 @@ def tokenize_and_align_labels(
     Returns:
         A dictionary containing the tokenized data as well as labels.
     """
-    # Tokenize the texts. We use the `is_split_into_words` argument here because
+    # Tokenise the texts. We use the `is_split_into_words` argument here because
     # the texts in our dataset are lists of words (with a label for each word)
     tokenized_inputs = tokeniser(
         examples["tokens"], is_split_into_words=True, truncation=True, padding=True
@@ -396,7 +396,7 @@ def handle_unk_tokens(
 
     Args:
         tokeniser:
-            The tokeniser used to tokenize the words.
+            The tokeniser used to tokenise the words.
         tokens:
             The list of tokens.
         words:
@@ -423,7 +423,7 @@ def handle_unk_tokens(
     # Fetch the word
     word = words[word_idx]
 
-    # Tokenize the word, which is now a list containing at least one UNK token
+    # Tokenise the word, which is now a list containing at least one UNK token
     tokens_with_unk = tokeniser.convert_ids_to_tokens(
         tokeniser.encode(word, add_special_tokens=False)
     )
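The fixed comment refers to the tokeniser's `is_split_into_words` mode, where the dataset provides pre-split words and sub-word tokens are mapped back to them for label alignment. An illustrative, self-contained sketch (the model name is an assumption):

from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
encoded = tokeniser(
    [["Maria", "bor", "i", "København"]],
    is_split_into_words=True,
    truncation=True,
    padding=True,
)
# word_ids() maps each token position to the index of the word it came from
# (None for special tokens), which is what the label alignment relies on.
print(encoded.word_ids(0))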
euroeval/tasks.py
CHANGED

@@ -88,7 +88,7 @@ SUMM = Task(
     default_num_few_shot_examples=1,
     default_max_generated_tokens=256,
     default_labels=[],
-
+    default_allowed_model_types=[ModelType.GENERATIVE],
 )
 
 
@@ -136,14 +136,14 @@ EUROPEAN_VALUES = Task(
     default_num_few_shot_examples=0,
     default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
-
-
+    default_allowed_model_types=[ModelType.GENERATIVE],
+    default_allowed_generative_types=[
         GenerativeType.INSTRUCTION_TUNED,
         GenerativeType.REASONING,
     ],
     requires_zero_shot=True,
     uses_logprobs=True,
-
+    default_allow_invalid_model_outputs=False,
 )
 
 