EuroEval 15.6.0-py3-none-any.whl → 15.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/benchmark_modules/litellm.py +136 -31
- euroeval/benchmark_modules/vllm.py +105 -38
- euroeval/benchmarker.py +12 -2
- euroeval/constants.py +1 -1
- euroeval/data_loading.py +48 -26
- euroeval/data_models.py +8 -12
- euroeval/dataset_configs/faroese.py +1 -0
- euroeval/dataset_configs/finnish.py +60 -0
- euroeval/dataset_configs/norwegian.py +1 -1
- euroeval/prompt_templates/linguistic_acceptability.py +9 -1
- euroeval/prompt_templates/multiple_choice.py +8 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -1
- euroeval/prompt_templates/reading_comprehension.py +11 -1
- euroeval/prompt_templates/sentiment_classification.py +11 -1
- euroeval/prompt_templates/summarization.py +9 -1
- euroeval/task_group_utils/sequence_classification.py +27 -32
- euroeval/task_group_utils/text_to_text.py +10 -27
- euroeval/tasks.py +1 -1
- euroeval/tokenization_utils.py +22 -6
- {euroeval-15.6.0.dist-info → euroeval-15.7.0.dist-info}/METADATA +15 -2
- {euroeval-15.6.0.dist-info → euroeval-15.7.0.dist-info}/RECORD +24 -23
- {euroeval-15.6.0.dist-info → euroeval-15.7.0.dist-info}/WHEEL +0 -0
- {euroeval-15.6.0.dist-info → euroeval-15.7.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.6.0.dist-info → euroeval-15.7.0.dist-info}/licenses/LICENSE +0 -0
euroeval/data_models.py
CHANGED
@@ -388,8 +388,10 @@ class DatasetConfig:
             language.
         _prompt_label_mapping (optional):
             A mapping from the labels to another phrase which is used as a substitute
-            for the label in few-shot evaluation.
-            and
+            for the label in few-shot evaluation. If "auto" then the mapping will be set
+            to a 1:1 mapping between the labels and themselves. If None then the mapping
+            will be set to the default mapping for the task and language. Defaults to
+            None.
         unofficial (optional):
             Whether the dataset is unofficial. Defaults to False.
     """
@@ -405,7 +407,7 @@ class DatasetConfig:
     _num_few_shot_examples: int | None = None
     _max_generated_tokens: int | None = None
     _labels: list[str] | None = None
-    _prompt_label_mapping: dict[str, str] | None = None
+    _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
     unofficial: bool = False

     @property
@@ -475,7 +477,9 @@ class DatasetConfig:
     @property
     def prompt_label_mapping(self) -> dict[str, str]:
         """Mapping from English labels to localised labels."""
-        if self._prompt_label_mapping
+        if self._prompt_label_mapping == "auto":
+            return {label: label for label in self.labels}
+        elif self._prompt_label_mapping is not None:
             return self._prompt_label_mapping

         main_language = self.languages[0]
@@ -517,14 +521,6 @@ class DatasetConfig:

         Returns:
             The natural string representation of the labels in specified language.
-
-        Raises:
-            NotImplementedError:
-                If `and_separator` or `or_separator` are `None`, see `Language`.
-
-        Example:
-            >>> get_labels_str(language=DA)
-            "'a', 'b', 'c' eller 'd'"
         """
         main_language = self.languages[0]

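The new "auto" option resolves to an identity mapping over the dataset's labels. A minimal sketch of that resolution logic, mirroring the property above (illustrative only, not the library's exact code):

labels = ["a", "b", "c", "d"]
prompt_label_mapping = "auto"

# "auto" maps every label to itself; an explicit dict is used as-is; None falls
# back to the per-task, per-language default inside EuroEval.
if prompt_label_mapping == "auto":
    resolved = {label: label for label in labels}
elif prompt_label_mapping is not None:
    resolved = prompt_label_mapping
else:
    resolved = {}

print(resolved)  # {'a': 'a', 'b': 'b', 'c': 'c', 'd': 'd'}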
euroeval/dataset_configs/finnish.py
ADDED
@@ -0,0 +1,60 @@
+"""All Finnish dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import FI
+from ..tasks import COMMON_SENSE, LA, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+SCANDISENT_FI_CONFIG = DatasetConfig(
+    name="scandisent-fi",
+    pretty_name="the truncated version of the Finnish part of the binary sentiment "
+    "classification dataset ScandiSent",
+    huggingface_id="EuroEval/scandisent-fi-mini",
+    task=SENT,
+    languages=[FI],
+    _labels=["negative", "positive"],
+)
+
+TURKU_NER_FI_CONFIG = DatasetConfig(
+    name="turku-ner-fi",
+    pretty_name="the Finnish part of the named entity recognition dataset Turku NER",
+    huggingface_id="EuroEval/turku-ner-fi-mini",
+    task=NER,
+    languages=[FI],
+)
+
+TYDIQA_FI_CONFIG = DatasetConfig(
+    name="tydiqa-fi",
+    pretty_name="the Finnish part of the TydiQA reading comprehension dataset",
+    huggingface_id="EuroEval/tydiqa-fi-mini",
+    task=RC,
+    languages=[FI],
+)
+
+XLSUM_FI_CONFIG = DatasetConfig(
+    name="xlsum-fi",
+    pretty_name="the Finnish summarisation dataset XL-Sum",
+    huggingface_id="EuroEval/xlsum-fi-mini",
+    task=SUMM,
+    languages=[FI],
+)
+
+HELLASWAG_FI_CONFIG = DatasetConfig(
+    name="hellaswag-fi",
+    pretty_name="the truncated version of the Finnish common-sense reasoning dataset "
+    "HellaSwag-fi, translated from the English HellaSwag dataset",
+    huggingface_id="EuroEval/hellaswag-fi-mini",
+    task=COMMON_SENSE,
+    languages=[FI],
+)
+
+SCALA_FI_CONFIG = DatasetConfig(
+    name="scala-fi",
+    pretty_name="the Finnish part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-fi",
+    task=LA,
+    languages=[FI],
+)
+
+### Unofficial datasets ###
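With these configurations in place, the new Finnish datasets can be selected by name. A hedged usage sketch, assuming the `Benchmarker` interface described in the EuroEval README (the model ID is a placeholder):

from euroeval import Benchmarker

benchmarker = Benchmarker()
# Evaluate a placeholder Hugging Face model on the new Finnish sentiment dataset.
benchmarker(model="<huggingface-model-id>", dataset="scandisent-fi")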
euroeval/dataset_configs/norwegian.py
CHANGED
@@ -83,6 +83,7 @@ NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
     huggingface_id="EuroEval/nor-common-sense-qa",
     task=COMMON_SENSE,
     languages=[NB, NN, NO],
+    _labels=["a", "b", "c", "d", "e"],
 )


@@ -105,7 +106,6 @@ NORGLM_MULTI_QA = DatasetConfig(
     huggingface_id="EuroEval/norglm-multi-qa",
     task=RC,
     languages=[NB, NN, NO],
-    _num_few_shot_examples=2,
     unofficial=True,
 )

euroeval/prompt_templates/linguistic_acceptability.py
CHANGED
@@ -1,7 +1,7 @@
 """Templates for the Linguistic Acceptability task."""

 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV

 LA_TEMPLATES = {
     DA: PromptConfig(
@@ -36,6 +36,14 @@ LA_TEMPLATES = {
         default_instruction_prompt="Texto: {text}\n\nDetermina si el texto es "
         "gramaticalmente correcto o no. Responde con {labels_str}, y nada más.",
     ),
+    FI: PromptConfig(
+        default_prompt_label_mapping=dict(correct="kyllä", incorrect="ei"),
+        default_prompt_prefix="Seuraavat ovat lauseita ja ovatko ne "
+        "kieliopillisesti oikein.",
+        default_prompt_template="Lause: {text}\nKieliopillisesti oikein: {label}",
+        default_instruction_prompt="Lause: {text}\n\nMääritä onko lause "
+        "oikein vai ei. Vastaa {labels_str}, ja ei mitään muuta.",
+    ),
     FO: PromptConfig(
         default_prompt_label_mapping=dict(correct="ja", incorrect="nei"),
         default_prompt_prefix="Hetta eru nakrir setningar og um teir eru mállæruliga "
euroeval/prompt_templates/multiple_choice.py
CHANGED
@@ -1,7 +1,7 @@
 """Templates for all multiple choice tasks."""

 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV

 # TODO: Missing Faroese
 MULTIPLE_CHOICE_TEMPLATES = {
@@ -36,6 +36,13 @@ MULTIPLE_CHOICE_TEMPLATES = {
         "usando solo {labels_str}, y nada más.",
         default_prompt_label_mapping="auto",
     ),
+    FI: PromptConfig(
+        default_prompt_prefix="Seuraavat ovat monivalintakysymyksiä (vastauksineen).",
+        default_prompt_template="Kysymys: {text}\nVastaus: {label}",
+        default_instruction_prompt="Kysymys: {text}\n\nVastaa yllä olevaan kysymykseen "
+        "käyttämällä {labels_str}, äläkä mitään muuta.",
+        default_prompt_label_mapping="auto",
+    ),
     FR: PromptConfig(
         default_prompt_prefix="Les questions suivantes sont des questions à choix "
         "multiples (avec réponses).",
euroeval/prompt_templates/named_entity_recognition.py
CHANGED
@@ -1,7 +1,7 @@
 """Templates for the Named Entity Recognition task."""

 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV

 NER_TEMPLATES = {
     DA: PromptConfig(
@@ -80,6 +80,25 @@ NER_TEMPLATES = {
         "claves {labels_str}. Los valores deben ser listas de las "
         "entidades nombradas de ese tipo, exactamente como aparecen en la oración.",
     ),
+    FI: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "henkilö",
+            "i-per": "henkilö",
+            "b-loc": "paikka",
+            "i-loc": "paikka",
+            "b-org": "organisaatio",
+            "i-org": "organisaatio",
+            "b-misc": "muut",
+            "i-misc": "muut",
+        },
+        default_prompt_prefix="Seuraavassa on lauseita ja JSON-sanakirjoja, jotka "
+        "sisältävät annetussa lauseessa esiintyvät nimetyt entiteetit.",
+        default_prompt_template="Lause: {text}\nNimetyt entiteetit: {label}",
+        default_instruction_prompt="Lause: {text}\n\nTunnista lauseessa olevat "
+        "entiteetit. Tulosta ne JSON-sanakirjana, jonka avaimet ovat {labels_str}. "
+        "Arvojen tulee olla listoja kyseisen tyypin nimetyistä entiteeteistä "
+        "täsmälleen siinä muodossa kuin ne esiintyvät lauseessa.",
+    ),
     FO: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "persónur",
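The Finnish NER prompts ask the model to reply with a JSON dictionary keyed by the localised entity type names defined above. A hedged illustration of that expected answer shape (the entity values are made up):

import json

# One list of surface forms per entity type, exactly as they appear in the sentence.
entities = {
    "henkilö": ["Sanna Marin"],
    "paikka": ["Helsinki"],
    "organisaatio": [],
    "muut": [],
}
print(json.dumps(entities, ensure_ascii=False))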
euroeval/prompt_templates/reading_comprehension.py
CHANGED
@@ -1,7 +1,7 @@
 """Templates for the Reading Comprehension task."""

 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV

 RC_TEMPLATES = {
     DA: PromptConfig(
@@ -39,6 +39,16 @@ RC_TEMPLATES = {
         "sobre el texto anterior en máximo 3 palabras.\n\nPregunta: {question}",
         default_prompt_label_mapping=dict(),
     ),
+    FI: PromptConfig(
+        default_prompt_prefix="Seuraavassa on tekstejä ja niihin liittyviä kysymyksiä "
+        "ja vastauksia.",
+        default_prompt_template="Teksti: {text}\nKysymys: {question} "
+        "\nVastaa enintään 3 sanalla: {label}",
+        default_instruction_prompt="Teksti: {text}\n\nVastaa seuraavaan "
+        "kysymykseen yllä olevasta tekstistä enintään 3 sanalla.\n\n"
+        "Kysymys: {question}",
+        default_prompt_label_mapping=dict(),
+    ),
     FO: PromptConfig(
         default_prompt_prefix="Hetta eru tekstir saman við spurningum og svar.",
         default_prompt_template="Tekstur: {text}\nSpurningur: {question}\nSvara við í "
euroeval/prompt_templates/sentiment_classification.py
CHANGED
@@ -1,7 +1,7 @@
 """Templates for the Sentiment Analysis task."""

 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV

 SENT_TEMPLATES = {
     DA: PromptConfig(
@@ -44,6 +44,16 @@ SENT_TEMPLATES = {
         default_instruction_prompt="Documento: {text}\n\nClasifica el sentimiento del "
         "documento. Responde con {labels_str}, y nada más.",
     ),
+    FI: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiivinen", neutral="neutrali", negative="negatiivinen"
+        ),
+        default_prompt_prefix="Seuraavassa on arvosteluja ja niiden tunnesävy, joka "
+        "voi olla {labels_str}.",
+        default_prompt_template="Teksti: {text}\nTunnesävy: {label}",
+        default_instruction_prompt="Teksti: {text}\n\nLuokittele arvostelun tunnesävy. "
+        "Vastaa vain {labels_str}, ei muuta.",
+    ),
     FO: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="positivt", neutral="neutralt", negative="negativt"
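These templates are plain Python format strings, so a few-shot example is produced by substituting the document text and the localised label. A small illustration using the new Finnish sentiment template (the review text is made up):

template = "Teksti: {text}\nTunnesävy: {label}"
label_mapping = dict(positive="positiivinen", neutral="neutrali", negative="negatiivinen")

# Render one few-shot example from a made-up review and its gold label.
example = template.format(text="Tämä elokuva oli loistava!", label=label_mapping["positive"])
print(example)
# Teksti: Tämä elokuva oli loistava!
# Tunnesävy: positiivinen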
euroeval/prompt_templates/summarization.py
CHANGED
@@ -1,7 +1,7 @@
 """Templates for the Summarization task."""

 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV

 # TODO: Missing Faroese
 SUMM_TEMPLATES = {
@@ -36,6 +36,14 @@ SUMM_TEMPLATES = {
         "documento anterior.",
         default_prompt_label_mapping=dict(),
     ),
+    FI: PromptConfig(
+        default_prompt_prefix="Seuraavassa on artikkeleita ja niihin liittyviä "
+        "tiivistelmiä.",
+        default_prompt_template="Uutisartikkeli: {text}\nTiivistelmä: {target_text}",
+        default_instruction_prompt="Uutisartikkeli: {text}\n\nKirjoita tiivistelmä "
+        "yllä olevasta artikkelista.",
+        default_prompt_label_mapping=dict(),
+    ),
     FR: PromptConfig(
         default_prompt_prefix="Les documents suivants sont accompagnés d'un résumé.",
         default_prompt_template="Document: {text}\nRésumé: {target_text}",
euroeval/task_group_utils/sequence_classification.py
CHANGED
@@ -132,22 +132,23 @@ def extract_labels_from_generation(
         The predicted labels.
     """
     if model_output.scores is not None:
-
+        labels = get_closest_logprobs_labels(
             generation_logprobs=model_output.scores,
             dataset_config=dataset_config,
             first_label_token_mapping=first_label_token_mapping,
         )
-
-
-
-
+        if labels is not None:
+            return labels
+    return get_closest_word_edit_labels(
+        generated_sequences=model_output.sequences, dataset_config=dataset_config
+    )


 def get_closest_logprobs_labels(
     generation_logprobs: list[list[list[tuple[str, float]]]],
     dataset_config: "DatasetConfig",
     first_label_token_mapping: dict[str, str] | bool,
-) -> list[str]:
+) -> list[str] | None:
     """Get the labels with the highest predicted logprob value.

     In case a candidate label is split into multiple tokens, we only use the first
@@ -167,7 +168,7 @@ def get_closest_logprobs_labels(
     mapping is outputted then the model will always output scores).

     Returns:
-        The predicted labels.
+        The predicted labels, or None if labels could not be extracted.

     Raises:
         InvalidBenchmark:
@@ -193,10 +194,7 @@ def get_closest_logprobs_labels(
         # We want to use the first generated label which contains a unique candidate
         # label, as the output label
         output_label: str | None = None
-
-        for label_idx, generated_label in enumerate(generated_labels):
-            generated_label = "".join(previously_generated_labels) + generated_label
-
+        for generated_label in generated_labels:
             # Get the candidate labels that starts with the generated label
             if isinstance(first_label_token_mapping, dict):
                 if any(
@@ -222,31 +220,28 @@ def get_closest_logprobs_labels(
                     if candidate_label.startswith(generated_label)
                 }

-                # If we can uniquely determine the output label, we break the loop.
-                # there are multiple possible labels then we store the current one, and
-                # concatenate it with the next generated label. We can only do this if
-                # the current one is the first one, however, since we're using greedy
-                # sampling. In case this happens for a label that is not the first one,
-                # we warn the user.
+                # If we can uniquely determine the output label, we break the loop.
                 if len(candidate_output_labels) == 1:
                     output_label = candidate_output_labels.pop()
                     break
+
+                # If we have multiple candidate labels, we cannot uniquely determine the
+                # output label, so we abandon extracting the labels using logprobs and
+                # fall back to using word edit distance.
                 elif len(candidate_output_labels) > 1:
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        "github.com/EuroEval/EuroEval/issues."
-                    )
+                    log_once(
+                        "Multiple candidate labels found for the generated label "
+                        f"{generated_label!r}: {candidate_output_labels}. This means "
+                        "that using logprobs to extract the labels is not reliable, "
+                        "and we will instead fall back to extracting the labels "
+                        "using word edit distance.",
+                        level=logging.DEBUG,
+                    )
+                    return None
+
+                # If no candidate label is found, we ignore the generated label, as it
+                # basically means that the model is just really bad at generating
+                # labels.
                 elif len(candidate_output_labels) == 0:
                     logger.debug(
                         f"No candidate label found for the generated label "
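When logprob extraction is abandoned, the fallback matches the raw generated text against the candidate labels by string similarity. A hedged sketch of that idea (not the library's `get_closest_word_edit_labels` implementation):

import difflib

def closest_label(generated_text: str, candidate_labels: list[str]) -> str:
    # Pick the candidate whose spelling is closest to what the model actually wrote.
    return max(
        candidate_labels,
        key=lambda label: difflib.SequenceMatcher(
            None, generated_text.lower(), label.lower()
        ).ratio(),
    )

print(closest_label("positivee", ["positive", "neutral", "negative"]))  # positive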
euroeval/task_group_utils/text_to_text.py
CHANGED
@@ -10,11 +10,7 @@ from evaluate import EvaluationModule
 from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
-from ..utils import
-    HiddenPrints,
-    clear_memory,
-    raise_if_model_output_contains_nan_values,
-)
+from ..utils import HiddenPrints, raise_if_model_output_contains_nan_values

 if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
@@ -89,20 +85,8 @@ def compute_metrics(
            score_dict: dict[str, float] | None = metric.compute(
                predictions=predictions, references=labels, **cfg.compute_kwargs
            )
-
-            # Clear the cache of the BERTScorer to avoid memory leaks
-            for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
-                if hasattr(metric, attribute):
-                    delattr(metric, attribute)
-
-            clear_memory()
            break
        except Exception as e:
-            # Clear the cache of the BERTScorer to avoid memory leaks
-            if hasattr(metric, "cached_bertscorer"):
-                del metric.cached_bertscorer
-            clear_memory()
-
            oom_error = [
                "CUDA out of memory",
                "CUDA error",
@@ -111,16 +95,7 @@ def compute_metrics(
            if not any(error in str(e) for error in oom_error):
                raise InvalidBenchmark(str(e))

-            if cfg.compute_kwargs.get("
-                batch_size = cfg.compute_kwargs["batch_size"]
-                cfg.compute_kwargs["batch_size"] = batch_size // 2
-                logger.debug(
-                    "Out of memory error occurred during the computation of "
-                    f"the metric {cfg.pretty_name}. Reducing the batch size to "
-                    f"{cfg.compute_kwargs['batch_size']}."
-                )
-            elif cfg.compute_kwargs.get("device", "cpu") != "cpu":
-                cfg.compute_kwargs["batch_size"] = 32
+            if cfg.compute_kwargs.get("device", "cpu") != "cpu":
                cfg.compute_kwargs["device"] = "cpu"
                logger.debug(
                    "Out of memory error occurred during the computation of "
@@ -129,6 +104,14 @@ def compute_metrics(
                )
            else:
                raise InvalidBenchmark(str(e))
+        finally:
+            for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
+                if hasattr(metric, attribute):
+                    logger.debug(
+                        f"Deleting the {attribute!r} attribute of the metric "
+                        f"{cfg.pretty_name} to free up memory."
+                    )
+                    delattr(metric, attribute)

    # The metric returns None if we are running on multi-GPU and the current
    # process is not the main process
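The net effect of this change is that the memory-freeing step is no longer duplicated across the success and failure paths; a `finally` block runs it either way. A hedged sketch of the pattern (the attribute name is just an example taken from the removed code):

def compute_with_cleanup(metric, **compute_kwargs):
    try:
        return metric.compute(**compute_kwargs)
    finally:
        # Runs whether compute() returned or raised, so cached scorers are always released.
        for attribute in ("cached_bertscorer",):
            if hasattr(metric, attribute):
                delattr(metric, attribute)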
euroeval/tasks.py
CHANGED
@@ -142,7 +142,7 @@ SUMM = Task(
         huggingface_id="bertscore",
         results_key="f1",
         compute_kwargs=dict(
-            model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=
+            model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
         ),
     ),
     MetricConfig(
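These `compute_kwargs` are forwarded to the BERTScore metric from the Hugging Face `evaluate` package, so this change pins its batch size to 1. A hedged illustration of roughly what that call looks like (the texts are made up, and `device` is shown as "cpu" because the "auto" value is resolved by EuroEval before the call):

import evaluate

bertscore = evaluate.load("bertscore")
scores = bertscore.compute(
    predictions=["a short summary of the article"],
    references=["the article's short summary"],
    model_type="microsoft/mdeberta-v3-base",
    batch_size=1,
    device="cpu",
)
print(scores["f1"])  # one F1 value per prediction/reference pair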
euroeval/tokenization_utils.py
CHANGED
@@ -7,6 +7,7 @@ import typing as t
 import torch

 from .constants import TASK_GROUPS_USING_LOGPROBS
+from .enums import GenerativeType
 from .exceptions import InvalidModel
 from .utils import log_once

@@ -14,7 +15,7 @@ if t.TYPE_CHECKING:
     from transformers.tokenization_utils import PreTrainedTokenizer
     from transformers.tokenization_utils_base import PreTrainedTokenizerBase

-    from .data_models import DatasetConfig
+    from .data_models import DatasetConfig, ModelConfig


 logger = logging.getLogger("euroeval")
@@ -254,35 +255,50 @@ def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | N


 def get_first_label_token_mapping(
-    dataset_config: "DatasetConfig",
+    dataset_config: "DatasetConfig",
+    model_config: "ModelConfig",
+    tokenizer: "PreTrainedTokenizer | None",
+    generative_type: "GenerativeType | None",
 ) -> dict[str, str] | bool:
     """Check if the model should output scores.

     Args:
         dataset_config:
             The dataset configuration.
+        model_config:
+            The model configuration.
         tokenizer:
             The tokenizer, or None if not available.
+        generative_type:
+            The generative type, or None if not available.

     Returns:
         A mapping from labels to the first token in each label, or alternatively a
         Boolean value indicating whether the model should output scores (if the mapping
         is outputted then the model will always output scores).
     """
+    if generative_type == GenerativeType.REASONING:
+        log_once(
+            f"The model {model_config.model_id!r} is a reasoning model and "
+            "thus does not support logprobs, so we do not enable it.",
+            level=logging.DEBUG,
+        )
+        return False
+
     # If we do not have any tokenizer, then we cannot check if the model should output
     # scores and we just assume it should if the dataset supports it
     output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
     if tokenizer is None:
         if output_scores:
             log_once(
-                "The model will output scores, since the
-                "tokenizer is available.",
+                f"The model {model_config.model_id!r} will output scores, since the "
+                "dataset supports it and no tokenizer is available.",
                 level=logging.DEBUG,
             )
         else:
             log_once(
-                "The model will not output scores, since
-                "it and no tokenizer is available.",
+                f"The model {model_config.model_id!r} will not output scores, since "
+                "the dataset does not support it and no tokenizer is available.",
                 level=logging.DEBUG,
             )
     return output_scores
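For non-reasoning models with a tokenizer available, the point of this helper is to map each candidate label to the first token of its tokenisation, so that the logprobs of the first generated token can be matched against the labels. A hedged sketch of that idea (the model and labels are chosen only for illustration, not EuroEval's exact code):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
labels = ["positive", "neutral", "negative"]

# Map each label to its first token; if two labels shared a first token,
# logprob-based scoring could not tell them apart.
first_label_token_mapping = {label: tokenizer.tokenize(label)[0] for label in labels}
print(first_label_token_mapping)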
{euroeval-15.6.0.dist-info → euroeval-15.7.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.
+Version: 15.7.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -32,7 +32,7 @@ Requires-Python: <4.0,>=3.10
 Requires-Dist: accelerate>=0.34.2
 Requires-Dist: bert-score>=0.3.13
 Requires-Dist: click>=8.1.3
-Requires-Dist: datasets>=
+Requires-Dist: datasets>=3.5.0
 Requires-Dist: demjson3>=3.0.6
 Requires-Dist: evaluate>=0.4.1
 Requires-Dist: huggingface-hub>=0.30.1
@@ -237,6 +237,19 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
+<a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
+
+
+### Contribute to EuroEval
+
+We welcome contributions to EuroEval! Whether you're fixing bugs, adding features, or
+contributing new datasets, your help makes this project better for everyone.
+
+- **General contributions**: Check out our [contribution guidelines](CONTRIBUTING.md)
+  for information on how to get started.
+- **Adding datasets**: If you're interested in adding a new dataset to EuroEval, we have
+  a [dedicated guide](NEW_DATASET_GUIDE.md) with step-by-step instructions.
+

 ### Special Thanks
 - Thanks to [Google](https://google.com/) for sponsoring Gemini credits as part of their
|