EuroEval 16.3.0-py3-none-any.whl → 16.4.0-py3-none-any.whl
- euroeval/__init__.py +3 -2
- euroeval/benchmark_config_factory.py +0 -4
- euroeval/benchmark_modules/base.py +3 -16
- euroeval/benchmark_modules/fresh.py +2 -1
- euroeval/benchmark_modules/hf.py +99 -62
- euroeval/benchmark_modules/litellm.py +101 -41
- euroeval/benchmark_modules/vllm.py +91 -83
- euroeval/benchmarker.py +84 -78
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/constants.py +6 -0
- euroeval/data_loading.py +14 -11
- euroeval/data_models.py +12 -4
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/czech.py +79 -0
- euroeval/dataset_configs/danish.py +10 -11
- euroeval/dataset_configs/dutch.py +0 -1
- euroeval/dataset_configs/english.py +0 -1
- euroeval/dataset_configs/estonian.py +11 -1
- euroeval/dataset_configs/finnish.py +0 -1
- euroeval/dataset_configs/french.py +0 -1
- euroeval/dataset_configs/german.py +0 -1
- euroeval/dataset_configs/italian.py +0 -1
- euroeval/dataset_configs/latvian.py +0 -1
- euroeval/dataset_configs/lithuanian.py +9 -3
- euroeval/dataset_configs/norwegian.py +0 -1
- euroeval/dataset_configs/polish.py +0 -1
- euroeval/dataset_configs/portuguese.py +0 -1
- euroeval/dataset_configs/slovak.py +60 -0
- euroeval/dataset_configs/spanish.py +0 -1
- euroeval/dataset_configs/swedish.py +10 -12
- euroeval/finetuning.py +21 -15
- euroeval/generation.py +10 -10
- euroeval/generation_utils.py +2 -3
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +9 -5
- euroeval/metrics/llm_as_a_judge.py +5 -3
- euroeval/metrics/pipeline.py +17 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +11 -14
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/linguistic_acceptability.py +21 -3
- euroeval/prompt_templates/multiple_choice.py +25 -1
- euroeval/prompt_templates/named_entity_recognition.py +51 -11
- euroeval/prompt_templates/reading_comprehension.py +31 -3
- euroeval/prompt_templates/sentiment_classification.py +23 -1
- euroeval/prompt_templates/summarization.py +26 -6
- euroeval/scores.py +7 -7
- euroeval/speed_benchmark.py +3 -5
- euroeval/task_group_utils/multiple_choice_classification.py +0 -3
- euroeval/task_group_utils/question_answering.py +0 -3
- euroeval/task_group_utils/sequence_classification.py +43 -31
- euroeval/task_group_utils/text_to_text.py +17 -8
- euroeval/task_group_utils/token_classification.py +10 -9
- euroeval/tokenisation_utils.py +14 -12
- euroeval/utils.py +29 -146
- {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/METADATA +4 -4
- euroeval-16.4.0.dist-info/RECORD +75 -0
- euroeval-16.3.0.dist-info/RECORD +0 -71
- {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
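The headline additions in 16.4.0 are Czech and Slovak support (new CS/SK languages, `dataset_configs/czech.py`, `dataset_configs/slovak.py`, and matching prompt templates), a new `euroeval/logging_utils.py` module plus `caching_utils.py`, and Lithuanian and Icelandic summarisation templates. As a rough, illustrative sketch of exercising the new languages through the `Benchmarker` API referenced later in this diff — the constructor arguments, call signature and model id below follow EuroEval's documented usage and are assumptions, not taken from this diff:

# Hypothetical usage sketch; the "cs"/"sk" codes mirror the new CS/SK language imports.
from euroeval import Benchmarker

benchmarker = Benchmarker(language=["cs", "sk"], progress_bar=True)
benchmarker(model="mistralai/Mistral-7B-v0.1")  # any supported model id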
euroeval/prompt_templates/named_entity_recognition.py
CHANGED

@@ -4,6 +4,7 @@ import typing as t
 
 from ..data_models import PromptConfig
 from ..languages import (
+    CS,
     DA,
     DE,
     EN,
@@ -22,6 +23,7 @@ from ..languages import (
     NO,
     PL,
     PT,
+    SK,
     SV,
 )
 
@@ -30,6 +32,25 @@ if t.TYPE_CHECKING:
 
 
 NER_TEMPLATES: dict["Language", PromptConfig] = {
+    CS: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "osoba",
+            "i-per": "osoba",
+            "b-loc": "místo",
+            "i-loc": "místo",
+            "b-org": "organizace",
+            "i-org": "organizace",
+            "b-misc": "různé",
+            "i-misc": "různé",
+        },
+        default_prompt_prefix="Následující jsou věty a JSON slovníky s pojmenovanými "
+        "entitami, které se v dané větě vyskytují.",
+        default_prompt_template="Věta: {text}\nPojmenované entity: {label}",
+        default_instruction_prompt="Věta: {text}\n\nIdentifikujte pojmenované entity "
+        "ve větě. Měli byste to vypsat jako JSON slovník s klíči {labels_str}. "
+        "Hodnoty by měly být seznamy pojmenovaných entit tohoto typu, přesně tak, "
+        "jak se objevují ve větě.",
+    ),
     DA: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "person",
@@ -361,20 +382,39 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
         default_prompt_label_mapping={
            "b-per": "osoba",
            "i-per": "osoba",
-            "b-loc": "
-            "i-loc": "
+            "b-loc": "miejsce",
+            "i-loc": "miejsce",
             "b-org": "organizacja",
             "i-org": "organizacja",
-            "b-misc": "
-            "i-misc": "
+            "b-misc": "inne",
+            "i-misc": "inne",
+        },
+        default_prompt_prefix="Poniżej znajdują się zdania i słowniki JSON "
+        "z jednostkami nazewniczymi, które występują w danym zdaniu.",
+        default_prompt_template="Zdanie: {text}\nJednostki nazewnicze: {label}",
+        default_instruction_prompt="Zdanie: {text}\n\nZidentyfikuj jednostki "
+        "nazewnicze w zdaniu. Wypisz je jako słownik JSON z kluczami "
+        "{labels_str}. Wartości odpowiadające kluczom powinny być listami jednostek "
+        "nazewniczych danego typu, dokładnie tak, jak pojawiają się w zdaniu.",
+    ),
+    SK: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "osoba",
+            "i-per": "osoba",
+            "b-loc": "miesto",
+            "i-loc": "miesto",
+            "b-org": "organizácia",
+            "i-org": "organizácia",
+            "b-misc": "rôzne",
+            "i-misc": "rôzne",
         },
-        default_prompt_prefix="
-        "
-        default_prompt_template="
-        default_instruction_prompt="
-        "
-        "{labels_str}.
-        "
+        default_prompt_prefix="Nasledujúce sú vety a JSON-objekty s pomenovanými "
+        "entitami, ktoré sa nachádzajú v danej vete.",
+        default_prompt_template="Veta: {text}\nPomenované entity: {label}",
+        default_instruction_prompt="Veta: {text}\n\nIdentifikujte pomenované "
+        "entity vo vete. Výstup by mal byť vo forme JSON-objektu s kľúčmi "
+        "{labels_str}. Hodnoty by mali byť zoznamy pomenovaných entít danej "
+        "kategórie, presne tak, ako sa vyskytujú vo vete.",
     ),
     SV: PromptConfig(
         default_prompt_label_mapping={

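For orientation, the PromptConfig fields above are combined into few-shot prompts elsewhere in the package; that assembly code is not part of this diff, so the following is only a sketch of how the new Czech NER template's placeholders fit together, using a made-up example sentence and entity dictionary:

import json

# Strings copied from the new Czech (CS) NER PromptConfig above.
prefix = (
    "Následující jsou věty a JSON slovníky s pojmenovanými "
    "entitami, které se v dané větě vyskytují."
)
template = "Věta: {text}\nPojmenované entity: {label}"

# Hypothetical few-shot example: keys are the mapped label names ("osoba",
# "místo", ...) and entity strings appear exactly as written in the sentence.
example_label = json.dumps(
    {"osoba": ["Václav Havel"], "místo": ["Praze"]}, ensure_ascii=False
)
shot = template.format(text="Václav Havel žil v Praze.", label=example_label)

# A prompt then looks roughly like: prefix, few-shot block(s), and the template
# with an empty label left for the model to complete.
prompt = f"{prefix}\n\n{shot}\n\n" + template.format(
    text="Masarykova univerzita sídlí v Brně.", label=""
)
print(prompt)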
euroeval/prompt_templates/reading_comprehension.py
CHANGED

@@ -4,6 +4,7 @@ import typing as t
 
 from ..data_models import PromptConfig
 from ..languages import (
+    CS,
     DA,
     DE,
     EN,
@@ -22,6 +23,7 @@ from ..languages import (
     NO,
     PL,
     PT,
+    SK,
     SV,
 )
 
@@ -29,6 +31,19 @@ if t.TYPE_CHECKING:
     from ..data_models import Language
 
 RC_TEMPLATES: dict["Language", PromptConfig] = {
+    CS: PromptConfig(
+        default_prompt_prefix="Následující texty obsahují otázky a odpovědi.",
+        default_prompt_template=(
+            "Text: {text}\nOtázka: {question}\nOdpověď maximálně 3 slovy: {label}"
+        ),
+        default_instruction_prompt=(
+            "Text: {text}\n\n"
+            "Odpovězte na následující otázku k výše uvedenému textu "
+            "maximálně 3 slovy.\n\n"
+            "Otázka: {question}"
+        ),
+        default_prompt_label_mapping=dict(),
+    ),
     DA: PromptConfig(
         default_prompt_prefix="Følgende er tekster med tilhørende spørgsmål og svar.",
         default_prompt_template="Tekst: {text}\nSpørgsmål: {question}\nSvar med maks. "
@@ -172,10 +187,11 @@ RC_TEMPLATES: dict["Language", PromptConfig] = {
         default_prompt_prefix=(
             "Poniżej znajdują się teksty z towarzyszącymi pytaniami i odpowiedziami."
         ),
-        default_prompt_template="Tekst: {text}\nPytanie: {question}\nOdpowiedź
-        "maksymalnie 3
+        default_prompt_template="Tekst: {text}\nPytanie: {question}\nOdpowiedź z "
+        "użyciem maksymalnie 3 słów: {label}",
         default_instruction_prompt="Tekst: {text}\n\nOdpowiedz na następujące pytanie "
-        "dotyczące powyższego tekstu
+        "dotyczące powyższego tekstu, używając maksymalnie 3 słów.\n\nPytanie: "
+        "{question}",
         default_prompt_label_mapping=dict(),
     ),
     PT: PromptConfig(
@@ -187,6 +203,18 @@ RC_TEMPLATES: dict["Language", PromptConfig] = {
         "sobre o texto acima num máximo de 3 palavras.\n\nPergunta: {question}",
         default_prompt_label_mapping=dict(),
     ),
+    SK: PromptConfig(
+        default_prompt_prefix=("Nasledujú texty s pridruženými otázkami a odpoveďami."),
+        default_prompt_template=(
+            "Text: {text}\nOtázka: {question}\nOdpoveď na maximálne 3 slová: {label}"
+        ),
+        default_instruction_prompt=(
+            "Text: {text}\n\n"
+            "Odpovedzte na nasledujúcu otázku týkajúcu sa textu uvedeného vyššie "
+            "maximálne 3 slovami.\n\nOtázka: {question}"
+        ),
+        default_prompt_label_mapping=dict(),
+    ),
     SV: PromptConfig(
         default_prompt_prefix="Nedan följer texter med tillhörande frågor och svar.",
         default_prompt_template="Text: {text}\nFråga: {question}\nSvar på max 3 ord: "

euroeval/prompt_templates/sentiment_classification.py
CHANGED

@@ -4,6 +4,7 @@ import typing as t
 
 from ..data_models import PromptConfig
 from ..languages import (
+    CS,
     DA,
     DE,
     EN,
@@ -22,6 +23,7 @@ from ..languages import (
     NO,
     PL,
     PT,
+    SK,
     SV,
 )
 
@@ -39,6 +41,16 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
         default_instruction_prompt="Dokument: {text}\n\nKlassificer sentimentet i "
         "dokumentet. Svar kun med {labels_str}, og intet andet.",
     ),
+    CS: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="pozitivní", neutral="neutrální", negative="negativní"
+        ),
+        default_prompt_prefix="Následují dokumenty a jejich sentiment, který může být "
+        "{labels_str}.",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlasifikujte sentiment v "
+        "dokumentu. Odpovězte pouze s {labels_str}, a nic jiného.",
+    ),
     DE: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="positiv", neutral="neutral", negative="negativ"
@@ -91,7 +103,7 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
         default_prompt_template="Dokument: {text}\nSentyment: {label}",
         default_instruction_prompt=(
             "Dokument: {text}\n\nKlasyfikuj sentyment w dokumencie. "
-            "Odpowiedz
+            "Odpowiedz jednym słowem: {labels_str}."
         ),
     ),
     PT: PromptConfig(
@@ -214,6 +226,16 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
         default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
         "teksten. Svar med {labels_str}, og ikke noe annet.",
     ),
+    SK: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="pozitívne", neutral="neutrálne", negative="negatívne"
+        ),
+        default_prompt_prefix="Nižšie sú dokumenty a ich sentiment, ktorý môže byť "
+        "{labels_str}.",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlasifikujte pocit v "
+        "dokumente. Odpovedzte so {labels_str}, a nič iné.",
+    ),
     SV: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="positiv", neutral="neutral", negative="negativ"

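The `default_prompt_label_mapping` above is what localises the raw `positive`/`neutral`/`negative` labels; the localised words are also what `{labels_str}` expands to in the prefix and instruction prompt. A small sketch with the new Czech sentiment config — how EuroEval actually quotes and joins `{labels_str}` is not shown in this diff, so the joining below is an assumption:

# Mapping copied from the new Czech (CS) sentiment PromptConfig above.
label_mapping = {"positive": "pozitivní", "neutral": "neutrální", "negative": "negativní"}

# Assumed rendering of {labels_str}; the real formatting may differ.
labels_str = ", ".join(f"'{word}'" for word in label_mapping.values())

instruction = (
    "Dokument: {text}\n\nKlasifikujte sentiment v "
    "dokumentu. Odpovězte pouze s {labels_str}, a nic jiného."
).format(text="Tento film byl naprosto skvělý!", labels_str=labels_str)
print(instruction)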
euroeval/prompt_templates/summarization.py
CHANGED

@@ -4,6 +4,7 @@ import typing as t
 
 from ..data_models import PromptConfig
 from ..languages import (
+    CS,
     DA,
     DE,
     EN,
@@ -13,6 +14,7 @@ from ..languages import (
     FR,
     IS,
     IT,
+    LT,
     LV,
     NB,
     NL,
@@ -28,6 +30,14 @@ if t.TYPE_CHECKING:
 
 # TODO: Missing Faroese
 SUMM_TEMPLATES: dict["Language", PromptConfig] = {
+    CS: PromptConfig(
+        default_prompt_prefix=("Následující jsou dokumenty s přiloženými souhrny."),
+        default_prompt_template=("Dokument: {text}\nSouhrn: {target_text}"),
+        default_instruction_prompt=(
+            "Dokument: {text}\n\nNapište souhrn výše uvedeného dokumentu."
+        ),
+        default_prompt_label_mapping=dict(),
+    ),
     DA: PromptConfig(
         default_prompt_prefix="Følgende er dokumenter med tilhørende resuméer.",
         default_prompt_template="Dokument: {text}\nResumé: {target_text}",
@@ -96,11 +106,14 @@ SUMM_TEMPLATES: dict["Language", PromptConfig] = {
         ),
         default_prompt_label_mapping=dict(),
     ),
-
-        default_prompt_prefix=
-
-
-        "
+    LT: PromptConfig(
+        default_prompt_prefix=(
+            "Žemiau pateikiami dokumentai su pridėtomis santraukomis."
+        ),
+        default_prompt_template=("Dokumentas: {text}\nSantrauka: {target_text}"),
+        default_instruction_prompt=(
+            "Dokumentas: {text}\n\nParašykite aukščiau pateikto dokumento santrauką."
+        ),
         default_prompt_label_mapping=dict(),
     ),
     IT: PromptConfig(
@@ -111,6 +124,13 @@ SUMM_TEMPLATES: dict["Language", PromptConfig] = {
         "documento di cui sopra.",
         default_prompt_label_mapping=dict(),
     ),
+    IS: PromptConfig(
+        default_prompt_prefix="Eftirfarandi eru skjöl með meðfylgjandi samantektum.",
+        default_prompt_template="Skjal: {text}\nSamantekt: {target_text}",
+        default_instruction_prompt="Skjal: {text}\n\nSkrifaðu samantekt á ofangreindu "
+        "skjali.",
+        default_prompt_label_mapping=dict(),
+    ),
     NB: PromptConfig(
         default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
         default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
@@ -142,7 +162,7 @@ SUMM_TEMPLATES: dict["Language", PromptConfig] = {
     ),
     PL: PromptConfig(
         default_prompt_prefix="Poniżej znajdują się artykuły z towarzyszącymi "
-        "streszczeniami.",
+        "im streszczeniami.",
         default_prompt_template="Artykuł: {text}\nStreszczenie: {target_text}",
         default_instruction_prompt="Artykuł: {text}\n\nNapisz streszczenie "
         "powyższego artykułu.",

euroeval/scores.py
CHANGED

@@ -6,12 +6,12 @@ import warnings
 
 import numpy as np
 
+from .logging_utils import log
+
 if t.TYPE_CHECKING:
     from .metrics import Metric
     from .types import ScoreDict
 
-logger = logging.getLogger("euroeval")
-
 
 def log_scores(
     dataset_name: str,
@@ -48,9 +48,8 @@ def log_scores(
     if model_param is not None:
         model_id += f"#{model_param}"
 
-    logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")
-
     total_dict: dict[str, float] = dict()
+    all_log_strs: list[str] = [f"Finished benchmarking {model_id} on {dataset_name}."]
     for metric in metrics:
         test_score, test_se = aggregate_scores(scores=scores, metric=metric)
         test_score, test_score_str = metric.postprocessing_fn(test_score)
@@ -58,11 +57,12 @@ def log_scores(
         total_dict[f"test_{metric.name}"] = test_score
         total_dict[f"test_{metric.name}_se"] = test_se
         log_str = (
-            f"{metric.pretty_name}: {test_score_str} ± {test_se_str}"
+            f"- {metric.pretty_name}: {test_score_str} ± {test_se_str}"
             if not np.isnan(test_se)
-            else f"{metric.pretty_name}: {test_score_str}"
+            else f"- {metric.pretty_name}: {test_score_str}"
         )
-
+        all_log_strs.append(log_str)
+    log("\n".join(all_log_strs), level=logging.INFO)
 
     return dict(raw=scores, total=total_dict)
 

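The net effect in `log_scores` is that per-metric results are no longer logged one line at a time: they are collected in `all_log_strs` and emitted as a single INFO message through the new `log` helper. The resulting message looks roughly like this sketch (metric name and numbers invented for illustration):

# Sketch of the message assembled by the new log_scores code path.
all_log_strs = ["Finished benchmarking example-model on example-dataset."]
for pretty_name, score_str, se_str in [("Micro-average F1-score", "67.31%", "1.20%")]:
    all_log_strs.append(f"- {pretty_name}: {score_str} ± {se_str}")
print("\n".join(all_log_strs))  # the real code passes this to log(..., level=logging.INFO)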
euroeval/speed_benchmark.py
CHANGED

@@ -4,19 +4,17 @@ import logging
 import typing as t
 
 import pyinfer
-from tqdm.auto import tqdm
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
 from .benchmark_modules import HuggingFaceEncoderModel, LiteLLMModel, VLLMModel
 from .exceptions import InvalidBenchmark
+from .logging_utils import get_pbar, log
 from .utils import clear_memory
 
 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def benchmark_speed(
     model: "BenchmarkModule", benchmark_config: "BenchmarkConfig"
@@ -33,7 +31,7 @@ def benchmark_speed(
         Dictionary of scores.
     """
     scores: list[dict[str, float]] = list()
-    for idx in
+    for idx in get_pbar(
         iterable=range(benchmark_config.num_iterations),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -41,7 +39,7 @@ def benchmark_speed(
         itr_scores = benchmark_speed_single_iteration(model=model, itr_idx=idx)
         clear_memory()
         scores.append(itr_scores)
-
+        log(f"Scores for iteration {idx}: {itr_scores}", level=logging.DEBUG)
     return scores
 
 

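The new `euroeval/logging_utils.py` module itself (+250 lines) is not shown in this diff, but the call sites above and below pin down the interface it has to expose: `log(message, level=...)`, `log_once(message, level=...)` and `get_pbar(iterable=..., desc=..., disable=...)`. The following is a minimal, purely illustrative sketch that would satisfy those call sites; the real module almost certainly does more (e.g. rank-aware or richer formatting):

import logging
import typing as t

from tqdm.auto import tqdm

_logger = logging.getLogger("euroeval")
_seen_messages: set[str] = set()


def log(message: str, level: int = logging.INFO) -> None:
    """Log a message at the given level (assumed interface)."""
    _logger.log(level, message)


def log_once(message: str, level: int = logging.INFO) -> None:
    """Log a message only the first time it is seen (assumed interface)."""
    if message not in _seen_messages:
        _seen_messages.add(message)
        log(message, level=level)


def get_pbar(iterable: t.Iterable, desc: str, disable: bool = False, **tqdm_kwargs) -> tqdm:
    """Wrap an iterable in a tqdm progress bar, matching the get_pbar call sites."""
    return tqdm(iterable=iterable, desc=desc, disable=disable, **tqdm_kwargs)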
euroeval/task_group_utils/multiple_choice_classification.py
CHANGED

@@ -1,7 +1,6 @@
 """Utility functions related to the multiple-choice classification task group."""
 
 import hashlib
-import logging
 import re
 import typing as t
 from collections import defaultdict
@@ -18,8 +17,6 @@ if t.TYPE_CHECKING:
 
     from ..types import Labels, Predictions
 
-logger = logging.getLogger("euroeval")
-
 
 class MultipleChoiceClassificationTrainer(Trainer):
     """Trainer subclass for multiple-choice classification tasks."""

euroeval/task_group_utils/question_answering.py
CHANGED

@@ -1,7 +1,6 @@
 """Utility functions related to the question-answering task group."""
 
 import collections.abc as c
-import logging
 import typing as t
 from collections import defaultdict
 
@@ -26,8 +25,6 @@ if t.TYPE_CHECKING:
     from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
-logger = logging.getLogger("euroeval")
-
 
 class QuestionAnsweringTrainer(Trainer):
     """Trainer subclass for question answering tasks."""

euroeval/task_group_utils/sequence_classification.py
CHANGED

@@ -19,13 +19,15 @@ if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
     from transformers.trainer_utils import EvalPrediction
 
-    from ..data_models import
+    from ..data_models import (
+        BenchmarkConfig,
+        DatasetConfig,
+        GenerativeModelOutput,
+        ModelConfig,
+    )
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
@@ -106,6 +108,7 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
+    model_config: "ModelConfig",
     first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
     """Extract the predicted labels from the generated output.
@@ -118,6 +121,8 @@ def extract_labels_from_generation(
             The raw generated output of the model.
         dataset_config:
             The configuration of the dataset.
+        model_config:
+            The configuration of the model.
         first_label_token_mapping:
             A mapping from labels to the first token in each label, or alternatively a
             Boolean value indicating whether the model should output scores (if the
@@ -167,6 +172,7 @@ def extract_labels_from_generation(
     )
 
     new_predicted_labels: list[str] = list()
+    num_predictions_being_very_off = 0
     for idx, predicted_label in enumerate(model_output.sequences):
         # If the prediction includes a boxed answer, use that instead of the full
         # generation
@@ -199,34 +205,40 @@ def extract_labels_from_generation(
         # word edit distance to the predicted label (if invalid model outputs are
         # allowed), or we raise an error
         if min(edit_distances) >= 1000:
-
-                logger.warning(
-                    "No candidate labels found for the predicted label "
-                    f"{predicted_label!r}, out of the candidate labels "
-                    f"{sample_candidate_labels[idx]}. This likely means that the model "
-                    "output is completely off, but since invalid model outputs are "
-                    "allowed for this task, we will use the closest candidate label "
-                    f"({best_candidate_label})) as the output label. If you see this "
-                    "warning very often, please report this issue to the EuroEval "
-                    "team at github.com/EuroEval/EuroEval/issues."
-                )
-                logger.debug(
-                    "The candidate labels were extracted from the prompt: "
-                    f"{input_batch['text'][idx]!r}."
-                )
-            else:
-                raise InvalidBenchmark(
-                    "No candidate labels found for the predicted label "
-                    f"{predicted_label!r}, out of the candidate labels "
-                    f"{sample_candidate_labels[idx]}. This likely means that the model "
-                    "output is completely off, and we cannot extract any labels from "
-                    "it. Please check the model output and the candidate labels. The "
-                    "candidate labels were extracted from the prompt: "
-                    f"{input_batch['text'][idx]!r}."
-                )
+            num_predictions_being_very_off += 1
 
         new_predicted_labels.append(best_candidate_label)
 
+    if num_predictions_being_very_off > 0:
+        if dataset_config.allow_invalid_model_outputs:
+            log_msg = (
+                "No candidate labels found for the predicted label in "
+                f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+                f"of the samples with the model {model_config.model_id!r}. This "
+                "likely means that the model were completely off in these cases, "
+                "but since invalid model outputs are allowed for this task, we used "
+                "the closest candidate labels as the output labels."
+            )
+            level = logging.DEBUG
+            if num_predictions_being_very_off / len(model_output.sequences) > 0.5:
+                log_msg += (
+                    " Since this happened for most of the model's predictions, please "
+                    "report this issue to the EuroEval team at "
+                    "github.com/EuroEval/EuroEval/issues."
+                )
+                level = logging.WARNING
+            log_once(log_msg, level=level)
+        else:
+            raise InvalidBenchmark(
+                "No candidate labels found for the predicted label in "
+                f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+                "of the samples. This likely means that the model were completely "
+                "off in these cases. Since this task does not allow invalid model "
+                "outputs, we have to abort the evaluation. Please re-run the "
+                "evaluation with the `--debug` flag (or `debug=True` if you're using "
+                "the `Benchmarker` API) to see the precise model outputs."
+            )
+
     return new_predicted_labels
 
 
@@ -355,7 +367,7 @@ def get_closest_logprobs_labels(
                     "be determined. This means that using logprobs to extract the "
                     "labels is not reliable, and we will instead fall back to "
                     "extracting the labels using word edit distance.",
-                    level=logging.
+                    level=logging.DEBUG,
                 )
             else:
                 log_once(
@@ -363,7 +375,7 @@ def get_closest_logprobs_labels(
                     "means that using logprobs to extract the labels is not reliable, "
                     "and we will instead fall back to extracting the labels using "
                     "word edit distance.",
-                    level=logging.
+                    level=logging.DEBUG,
                 )
             return None
 

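The rewritten block above swaps per-sample warnings and exceptions for a single post-loop summary: mismatches are counted in `num_predictions_being_very_off`, reported once at DEBUG level when rare, escalated to one WARNING when they cover more than half of the samples, and turned into a single `InvalidBenchmark` when the task forbids invalid outputs. The same count-then-summarise pattern in isolation, as a sketch (`RuntimeError` stands in for EuroEval's `InvalidBenchmark`):

import logging


def summarise_label_mismatches(num_bad: int, num_total: int, allow_invalid: bool) -> None:
    """Report label-matching failures once, after the whole batch has been processed."""
    if num_bad == 0:
        return
    if not allow_invalid:
        raise RuntimeError(f"No candidate labels found in {num_bad:,}/{num_total:,} samples.")
    # Rare failures stay at DEBUG; failures in more than half of the samples escalate.
    level = logging.WARNING if num_bad / num_total > 0.5 else logging.DEBUG
    logging.getLogger("euroeval").log(
        level,
        f"No candidate labels found in {num_bad:,}/{num_total:,} samples; "
        "used the closest candidate labels instead.",
    )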
euroeval/task_group_utils/text_to_text.py
CHANGED

@@ -7,6 +7,7 @@ import numpy as np
 
 from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
 from ..metrics import HuggingFaceMetric
 from ..utils import raise_if_model_output_contains_nan_values
 
@@ -18,9 +19,6 @@ if t.TYPE_CHECKING:
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
@@ -44,6 +42,10 @@ def compute_metrics(
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
         values.
+
+    Raises:
+        InvalidBenchmark:
+            If the metric computation fails.
     """
     model_outputs, labels = model_outputs_and_labels
 
@@ -72,7 +74,7 @@ def compute_metrics(
     ):
         metric.compute_kwargs["device"] = benchmark_config.device.type
 
-
+    for _ in range(num_attempts := 5):
         try:
             score: float | None = metric(
                 predictions=predictions,
@@ -96,21 +98,28 @@ def compute_metrics(
                 and metric.compute_kwargs.get("device", "cpu") != "cpu"
             ):
                 metric.compute_kwargs["device"] = "cpu"
-
+                log(
                     "Out of memory error occurred during the computation of "
                     f"the metric {metric.pretty_name}. Moving the computation to "
-                    "the CPU."
+                    "the CPU.",
+                    level=logging.DEBUG,
                 )
             else:
                 raise InvalidBenchmark(str(e)) from e
         finally:
             for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
                 if hasattr(metric, attribute):
-
+                    log(
                         f"Deleting the {attribute!r} attribute of the metric "
-                        f"{metric.pretty_name} to free up memory."
+                        f"{metric.pretty_name} to free up memory.",
+                        level=logging.DEBUG,
                     )
                     delattr(metric, attribute)
+    else:
+        raise InvalidBenchmark(
+            f"Could not compute the metric {metric.pretty_name} after "
+            f"{num_attempts} attempts due to out of memory errors."
+        )
 
     # The metric returns None if we are running on multi-GPU and the current
     # process is not the main process

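The retry loop above leans on Python's for/else: the `else` block runs only if the loop finishes without hitting a `break`, so (assuming the successful code path breaks out of the loop in a part of the function not shown in this hunk) the final `InvalidBenchmark` is raised only after all five attempts fail with out-of-memory errors. The construct in miniature, as a generic sketch:

def compute_with_retries(attempt_fn, num_attempts: int = 5):
    """Retry a flaky computation; the for/else raises only if every attempt failed."""
    for _ in range(num_attempts):
        try:
            result = attempt_fn()
        except MemoryError:
            continue  # e.g. fall back to a smaller batch or the CPU, then retry
        break  # success: the else clause below is skipped
    else:
        raise RuntimeError(f"Could not compute the result after {num_attempts} attempts.")
    return result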
euroeval/task_group_utils/token_classification.py
CHANGED

@@ -7,6 +7,7 @@ from copy import deepcopy
 import numpy as np
 
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
 from ..utils import (
     extract_json_dict_from_string,
     raise_if_model_output_contains_nan_values,
@@ -22,9 +23,6 @@ if t.TYPE_CHECKING:
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     has_misc_tags: bool,
@@ -216,17 +214,19 @@ def extract_labels_from_generation(
     prompt_label_mapping = dataset_config.prompt_label_mapping
     for prompt_tag_name, named_entities in prediction_dict.items():
         if not isinstance(named_entities, list):
-
+            log(
                 "The model produced an invalid format for the named entities. "
-                f"Expected a list but got {type(named_entities)}. Skipping."
+                f"Expected a list but got {type(named_entities)}. Skipping.",
+                level=logging.DEBUG,
             )
             continue
         try:
             named_entities = [str(ne) for ne in named_entities]
         except Exception:
-
+            log(
                 "The model produced an invalid format for the named entities. "
-                f"Expected a list of strings but got {named_entities}. Skipping."
+                f"Expected a list of strings but got {named_entities}. Skipping.",
+                level=logging.DEBUG,
             )
             continue
         try:
@@ -236,9 +236,10 @@ def extract_labels_from_generation(
                 if prompt_tag == prompt_tag_name
             ][0]
         except IndexError:
-
+            log(
                 "The model produced an invalid prompt tag name, "
-                f"{prompt_tag_name}. Skipping."
+                f"{prompt_tag_name}. Skipping.",
+                level=logging.DEBUG,
             )
             continue
 