EuroEval: euroeval-16.3.0-py3-none-any.whl → euroeval-16.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (64)
  1. euroeval/__init__.py +3 -2
  2. euroeval/benchmark_config_factory.py +0 -4
  3. euroeval/benchmark_modules/base.py +3 -16
  4. euroeval/benchmark_modules/fresh.py +2 -1
  5. euroeval/benchmark_modules/hf.py +99 -62
  6. euroeval/benchmark_modules/litellm.py +101 -41
  7. euroeval/benchmark_modules/vllm.py +91 -83
  8. euroeval/benchmarker.py +84 -78
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/constants.py +6 -0
  12. euroeval/data_loading.py +14 -11
  13. euroeval/data_models.py +12 -4
  14. euroeval/dataset_configs/__init__.py +2 -0
  15. euroeval/dataset_configs/czech.py +79 -0
  16. euroeval/dataset_configs/danish.py +10 -11
  17. euroeval/dataset_configs/dutch.py +0 -1
  18. euroeval/dataset_configs/english.py +0 -1
  19. euroeval/dataset_configs/estonian.py +11 -1
  20. euroeval/dataset_configs/finnish.py +0 -1
  21. euroeval/dataset_configs/french.py +0 -1
  22. euroeval/dataset_configs/german.py +0 -1
  23. euroeval/dataset_configs/italian.py +0 -1
  24. euroeval/dataset_configs/latvian.py +0 -1
  25. euroeval/dataset_configs/lithuanian.py +9 -3
  26. euroeval/dataset_configs/norwegian.py +0 -1
  27. euroeval/dataset_configs/polish.py +0 -1
  28. euroeval/dataset_configs/portuguese.py +0 -1
  29. euroeval/dataset_configs/slovak.py +60 -0
  30. euroeval/dataset_configs/spanish.py +0 -1
  31. euroeval/dataset_configs/swedish.py +10 -12
  32. euroeval/finetuning.py +21 -15
  33. euroeval/generation.py +10 -10
  34. euroeval/generation_utils.py +2 -3
  35. euroeval/logging_utils.py +250 -0
  36. euroeval/metrics/base.py +0 -3
  37. euroeval/metrics/huggingface.py +9 -5
  38. euroeval/metrics/llm_as_a_judge.py +5 -3
  39. euroeval/metrics/pipeline.py +17 -9
  40. euroeval/metrics/speed.py +0 -3
  41. euroeval/model_cache.py +11 -14
  42. euroeval/model_config.py +4 -5
  43. euroeval/model_loading.py +3 -0
  44. euroeval/prompt_templates/linguistic_acceptability.py +21 -3
  45. euroeval/prompt_templates/multiple_choice.py +25 -1
  46. euroeval/prompt_templates/named_entity_recognition.py +51 -11
  47. euroeval/prompt_templates/reading_comprehension.py +31 -3
  48. euroeval/prompt_templates/sentiment_classification.py +23 -1
  49. euroeval/prompt_templates/summarization.py +26 -6
  50. euroeval/scores.py +7 -7
  51. euroeval/speed_benchmark.py +3 -5
  52. euroeval/task_group_utils/multiple_choice_classification.py +0 -3
  53. euroeval/task_group_utils/question_answering.py +0 -3
  54. euroeval/task_group_utils/sequence_classification.py +43 -31
  55. euroeval/task_group_utils/text_to_text.py +17 -8
  56. euroeval/task_group_utils/token_classification.py +10 -9
  57. euroeval/tokenisation_utils.py +14 -12
  58. euroeval/utils.py +29 -146
  59. {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/METADATA +4 -4
  60. euroeval-16.4.0.dist-info/RECORD +75 -0
  61. euroeval-16.3.0.dist-info/RECORD +0 -71
  62. {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
  63. {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
  64. {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/prompt_templates/named_entity_recognition.py CHANGED
@@ -4,6 +4,7 @@ import typing as t
 
  from ..data_models import PromptConfig
  from ..languages import (
+ CS,
  DA,
  DE,
  EN,
@@ -22,6 +23,7 @@ from ..languages import (
  NO,
  PL,
  PT,
+ SK,
  SV,
  )
 
@@ -30,6 +32,25 @@ if t.TYPE_CHECKING:
 
 
  NER_TEMPLATES: dict["Language", PromptConfig] = {
+ CS: PromptConfig(
+ default_prompt_label_mapping={
+ "b-per": "osoba",
+ "i-per": "osoba",
+ "b-loc": "místo",
+ "i-loc": "místo",
+ "b-org": "organizace",
+ "i-org": "organizace",
+ "b-misc": "různé",
+ "i-misc": "různé",
+ },
+ default_prompt_prefix="Následující jsou věty a JSON slovníky s pojmenovanými "
+ "entitami, které se v dané větě vyskytují.",
+ default_prompt_template="Věta: {text}\nPojmenované entity: {label}",
+ default_instruction_prompt="Věta: {text}\n\nIdentifikujte pojmenované entity "
+ "ve větě. Měli byste to vypsat jako JSON slovník s klíči {labels_str}. "
+ "Hodnoty by měly být seznamy pojmenovaných entit tohoto typu, přesně tak, "
+ "jak se objevují ve větě.",
+ ),
  DA: PromptConfig(
  default_prompt_label_mapping={
  "b-per": "person",
@@ -361,20 +382,39 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
  default_prompt_label_mapping={
  "b-per": "osoba",
  "i-per": "osoba",
- "b-loc": "lokalizacja",
- "i-loc": "lokalizacja",
+ "b-loc": "miejsce",
+ "i-loc": "miejsce",
  "b-org": "organizacja",
  "i-org": "organizacja",
- "b-misc": "różne",
- "i-misc": "różne",
+ "b-misc": "inne",
+ "i-misc": "inne",
+ },
+ default_prompt_prefix="Poniżej znajdują się zdania i słowniki JSON "
+ "z jednostkami nazewniczymi, które występują w danym zdaniu.",
+ default_prompt_template="Zdanie: {text}\nJednostki nazewnicze: {label}",
+ default_instruction_prompt="Zdanie: {text}\n\nZidentyfikuj jednostki "
+ "nazewnicze w zdaniu. Wypisz je jako słownik JSON z kluczami "
+ "{labels_str}. Wartości odpowiadające kluczom powinny być listami jednostek "
+ "nazewniczych danego typu, dokładnie tak, jak pojawiają się w zdaniu.",
+ ),
+ SK: PromptConfig(
+ default_prompt_label_mapping={
+ "b-per": "osoba",
+ "i-per": "osoba",
+ "b-loc": "miesto",
+ "i-loc": "miesto",
+ "b-org": "organizácia",
+ "i-org": "organizácia",
+ "b-misc": "rôzne",
+ "i-misc": "rôzne",
  },
- default_prompt_prefix="Poniżej znajdują się zdania i słowniki JSON z nazwanymi "
- "jednostkami występującymi w danym zdaniu.",
- default_prompt_template="Zdanie: {text}\nNazwane jednostki: {label}",
- default_instruction_prompt="Zdanie: {text}\n\nZidentyfikuj nazwane jednostki "
- "w zdaniu. Powinieneś wypisać to jako słownik JSON z kluczami "
- "{labels_str}. Wartości powinny być listami nazwanych jednostek "
- "tego typu, dokładnie tak jak pojawiają się w zdaniu.",
+ default_prompt_prefix="Nasledujúce vety a JSON-objekty s pomenovanými "
+ "entitami, ktoré sa nachádzajú v danej vete.",
+ default_prompt_template="Veta: {text}\nPomenované entity: {label}",
+ default_instruction_prompt="Veta: {text}\n\nIdentifikujte pomenované "
+ "entity vo vete. Výstup by mal byť vo forme JSON-objektu s kľúčmi "
+ "{labels_str}. Hodnoty by mali byť zoznamy pomenovaných entít danej "
+ "kategórie, presne tak, ako sa vyskytujú vo vete.",
  ),
  SV: PromptConfig(
  default_prompt_label_mapping={
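For reference, a rough sketch of how the new Czech (CS) entry's prefix and template could fit together into a few-shot NER prompt. The joining logic, the example sentence and the JSON formatting are assumptions for illustration only, not EuroEval's actual prompt-assembly code.

# Illustrative sketch only: assembles a Czech NER prompt from the new CS
# PromptConfig fields. Joining logic and example data are assumptions;
# EuroEval's real prompt assembly lives elsewhere in the package.
import json

prefix = (
    "Následující jsou věty a JSON slovníky s pojmenovanými "
    "entitami, které se v dané větě vyskytují."
)
template = "Věta: {text}\nPojmenované entity: {label}"

example_label = json.dumps(
    {"osoba": ["Karel Čapek"], "místo": ["Praha"]}, ensure_ascii=False
)
few_shot_example = template.format(
    text="Karel Čapek žil v Praze.", label=example_label
)
print(prefix + "\n\n" + few_shot_example)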
euroeval/prompt_templates/reading_comprehension.py CHANGED
@@ -4,6 +4,7 @@ import typing as t
 
  from ..data_models import PromptConfig
  from ..languages import (
+ CS,
  DA,
  DE,
  EN,
@@ -22,6 +23,7 @@ from ..languages import (
  NO,
  PL,
  PT,
+ SK,
  SV,
  )
 
@@ -29,6 +31,19 @@ if t.TYPE_CHECKING:
  from ..data_models import Language
 
  RC_TEMPLATES: dict["Language", PromptConfig] = {
+ CS: PromptConfig(
+ default_prompt_prefix="Následující texty obsahují otázky a odpovědi.",
+ default_prompt_template=(
+ "Text: {text}\nOtázka: {question}\nOdpověď maximálně 3 slovy: {label}"
+ ),
+ default_instruction_prompt=(
+ "Text: {text}\n\n"
+ "Odpovězte na následující otázku k výše uvedenému textu "
+ "maximálně 3 slovy.\n\n"
+ "Otázka: {question}"
+ ),
+ default_prompt_label_mapping=dict(),
+ ),
  DA: PromptConfig(
  default_prompt_prefix="Følgende er tekster med tilhørende spørgsmål og svar.",
  default_prompt_template="Tekst: {text}\nSpørgsmål: {question}\nSvar med maks. "
@@ -172,10 +187,11 @@ RC_TEMPLATES: dict["Language", PromptConfig] = {
  default_prompt_prefix=(
  "Poniżej znajdują się teksty z towarzyszącymi pytaniami i odpowiedziami."
  ),
- default_prompt_template="Tekst: {text}\nPytanie: {question}\nOdpowiedź w "
- "maksymalnie 3 słowach: {label}",
+ default_prompt_template="Tekst: {text}\nPytanie: {question}\nOdpowiedź z "
+ "użyciem maksymalnie 3 słów: {label}",
  default_instruction_prompt="Tekst: {text}\n\nOdpowiedz na następujące pytanie "
- "dotyczące powyższego tekstu w maksymalnie 3 słowach.\n\nPytanie: {question}",
+ "dotyczące powyższego tekstu, używając maksymalnie 3 słów.\n\nPytanie: "
+ "{question}",
  default_prompt_label_mapping=dict(),
  ),
  PT: PromptConfig(
@@ -187,6 +203,18 @@ RC_TEMPLATES: dict["Language", PromptConfig] = {
  "sobre o texto acima num máximo de 3 palavras.\n\nPergunta: {question}",
  default_prompt_label_mapping=dict(),
  ),
+ SK: PromptConfig(
+ default_prompt_prefix=("Nasledujú texty s pridruženými otázkami a odpoveďami."),
+ default_prompt_template=(
+ "Text: {text}\nOtázka: {question}\nOdpoveď na maximálne 3 slová: {label}"
+ ),
+ default_instruction_prompt=(
+ "Text: {text}\n\n"
+ "Odpovedzte na nasledujúcu otázku týkajúcu sa textu uvedeného vyššie "
+ "maximálne 3 slovami.\n\nOtázka: {question}"
+ ),
+ default_prompt_label_mapping=dict(),
+ ),
  SV: PromptConfig(
  default_prompt_prefix="Nedan följer texter med tillhörande frågor och svar.",
  default_prompt_template="Text: {text}\nFråga: {question}\nSvar på max 3 ord: "
euroeval/prompt_templates/sentiment_classification.py CHANGED
@@ -4,6 +4,7 @@ import typing as t
 
  from ..data_models import PromptConfig
  from ..languages import (
+ CS,
  DA,
  DE,
  EN,
@@ -22,6 +23,7 @@ from ..languages import (
  NO,
  PL,
  PT,
+ SK,
  SV,
  )
 
@@ -39,6 +41,16 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
  default_instruction_prompt="Dokument: {text}\n\nKlassificer sentimentet i "
  "dokumentet. Svar kun med {labels_str}, og intet andet.",
  ),
+ CS: PromptConfig(
+ default_prompt_label_mapping=dict(
+ positive="pozitivní", neutral="neutrální", negative="negativní"
+ ),
+ default_prompt_prefix="Následují dokumenty a jejich sentiment, který může být "
+ "{labels_str}.",
+ default_prompt_template="Dokument: {text}\nSentiment: {label}",
+ default_instruction_prompt="Dokument: {text}\n\nKlasifikujte sentiment v "
+ "dokumentu. Odpovězte pouze s {labels_str}, a nic jiného.",
+ ),
  DE: PromptConfig(
  default_prompt_label_mapping=dict(
  positive="positiv", neutral="neutral", negative="negativ"
@@ -91,7 +103,7 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
  default_prompt_template="Dokument: {text}\nSentyment: {label}",
  default_instruction_prompt=(
  "Dokument: {text}\n\nKlasyfikuj sentyment w dokumencie. "
- "Odpowiedz z {labels_str}, i nic więcej."
+ "Odpowiedz jednym słowem: {labels_str}."
  ),
  ),
  PT: PromptConfig(
@@ -214,6 +226,16 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
  default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
  "teksten. Svar med {labels_str}, og ikke noe annet.",
  ),
+ SK: PromptConfig(
+ default_prompt_label_mapping=dict(
+ positive="pozitívne", neutral="neutrálne", negative="negatívne"
+ ),
+ default_prompt_prefix="Nižšie sú dokumenty a ich sentiment, ktorý môže byť "
+ "{labels_str}.",
+ default_prompt_template="Dokument: {text}\nSentiment: {label}",
+ default_instruction_prompt="Dokument: {text}\n\nKlasifikujte pocit v "
+ "dokumente. Odpovedzte so {labels_str}, a nič iné.",
+ ),
  SV: PromptConfig(
  default_prompt_label_mapping=dict(
  positive="positiv", neutral="neutral", negative="negativ"
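A hypothetical rendering of the new Czech sentiment instruction prompt, showing how the {labels_str} placeholder relates to the values of default_prompt_label_mapping; the exact way EuroEval formats and joins the labels may differ.

# Hypothetical rendering of the new Czech sentiment instruction prompt.
# How labels_str is actually built inside EuroEval may differ.
label_mapping = dict(positive="pozitivní", neutral="neutrální", negative="negativní")
instruction = (
    "Dokument: {text}\n\nKlasifikujte sentiment v "
    "dokumentu. Odpovězte pouze s {labels_str}, a nic jiného."
)
labels_str = ", ".join(label_mapping.values())  # assumed joining for illustration
print(instruction.format(text="Ten film byl skvělý.", labels_str=labels_str))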
euroeval/prompt_templates/summarization.py CHANGED
@@ -4,6 +4,7 @@ import typing as t
 
  from ..data_models import PromptConfig
  from ..languages import (
+ CS,
  DA,
  DE,
  EN,
@@ -13,6 +14,7 @@ from ..languages import (
  FR,
  IS,
  IT,
+ LT,
  LV,
  NB,
  NL,
@@ -28,6 +30,14 @@ if t.TYPE_CHECKING:
 
  # TODO: Missing Faroese
  SUMM_TEMPLATES: dict["Language", PromptConfig] = {
+ CS: PromptConfig(
+ default_prompt_prefix=("Následující jsou dokumenty s přiloženými souhrny."),
+ default_prompt_template=("Dokument: {text}\nSouhrn: {target_text}"),
+ default_instruction_prompt=(
+ "Dokument: {text}\n\nNapište souhrn výše uvedeného dokumentu."
+ ),
+ default_prompt_label_mapping=dict(),
+ ),
  DA: PromptConfig(
  default_prompt_prefix="Følgende er dokumenter med tilhørende resuméer.",
  default_prompt_template="Dokument: {text}\nResumé: {target_text}",
@@ -96,11 +106,14 @@ SUMM_TEMPLATES: dict["Language", PromptConfig] = {
  ),
  default_prompt_label_mapping=dict(),
  ),
- IS: PromptConfig(
- default_prompt_prefix="Eftirfarandi eru skjöl með meðfylgjandi samantektum.",
- default_prompt_template="Skjal: {text}\nSamantekt: {target_text}",
- default_instruction_prompt="Skjal: {text}\n\nSkrifaðu samantekt á ofangreindu "
- "skjali.",
+ LT: PromptConfig(
+ default_prompt_prefix=(
+ "Žemiau pateikiami dokumentai su pridėtomis santraukomis."
+ ),
+ default_prompt_template=("Dokumentas: {text}\nSantrauka: {target_text}"),
+ default_instruction_prompt=(
+ "Dokumentas: {text}\n\nParašykite aukščiau pateikto dokumento santrauką."
+ ),
  default_prompt_label_mapping=dict(),
  ),
  IT: PromptConfig(
@@ -111,6 +124,13 @@ SUMM_TEMPLATES: dict["Language", PromptConfig] = {
  "documento di cui sopra.",
  default_prompt_label_mapping=dict(),
  ),
+ IS: PromptConfig(
+ default_prompt_prefix="Eftirfarandi eru skjöl með meðfylgjandi samantektum.",
+ default_prompt_template="Skjal: {text}\nSamantekt: {target_text}",
+ default_instruction_prompt="Skjal: {text}\n\nSkrifaðu samantekt á ofangreindu "
+ "skjali.",
+ default_prompt_label_mapping=dict(),
+ ),
  NB: PromptConfig(
  default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
  default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
@@ -142,7 +162,7 @@ SUMM_TEMPLATES: dict["Language", PromptConfig] = {
  ),
  PL: PromptConfig(
  default_prompt_prefix="Poniżej znajdują się artykuły z towarzyszącymi "
- "streszczeniami.",
+ "im streszczeniami.",
  default_prompt_template="Artykuł: {text}\nStreszczenie: {target_text}",
  default_instruction_prompt="Artykuł: {text}\n\nNapisz streszczenie "
  "powyższego artykułu.",
euroeval/scores.py CHANGED
@@ -6,12 +6,12 @@ import warnings
 
  import numpy as np
 
+ from .logging_utils import log
+
  if t.TYPE_CHECKING:
  from .metrics import Metric
  from .types import ScoreDict
 
- logger = logging.getLogger("euroeval")
-
 
  def log_scores(
  dataset_name: str,
@@ -48,9 +48,8 @@
  if model_param is not None:
  model_id += f"#{model_param}"
 
- logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")
-
  total_dict: dict[str, float] = dict()
+ all_log_strs: list[str] = [f"Finished benchmarking {model_id} on {dataset_name}."]
  for metric in metrics:
  test_score, test_se = aggregate_scores(scores=scores, metric=metric)
  test_score, test_score_str = metric.postprocessing_fn(test_score)
@@ -58,11 +57,12 @@
  total_dict[f"test_{metric.name}"] = test_score
  total_dict[f"test_{metric.name}_se"] = test_se
  log_str = (
- f"{metric.pretty_name}: {test_score_str} ± {test_se_str}"
+ f"- {metric.pretty_name}: {test_score_str} ± {test_se_str}"
  if not np.isnan(test_se)
- else f"{metric.pretty_name}: {test_score_str}"
+ else f"- {metric.pretty_name}: {test_score_str}"
  )
- logger.info(log_str)
+ all_log_strs.append(log_str)
+ log("\n".join(all_log_strs), level=logging.INFO)
 
  return dict(raw=scores, total=total_dict)
 
euroeval/speed_benchmark.py CHANGED
@@ -4,19 +4,17 @@ import logging
  import typing as t
 
  import pyinfer
- from tqdm.auto import tqdm
  from transformers.models.auto.tokenization_auto import AutoTokenizer
 
  from .benchmark_modules import HuggingFaceEncoderModel, LiteLLMModel, VLLMModel
  from .exceptions import InvalidBenchmark
+ from .logging_utils import get_pbar, log
  from .utils import clear_memory
 
  if t.TYPE_CHECKING:
  from .benchmark_modules import BenchmarkModule
  from .data_models import BenchmarkConfig
 
- logger = logging.getLogger("euroeval")
-
 
 
  def benchmark_speed(
@@ -33,7 +31,7 @@
  Dictionary of scores.
  """
  scores: list[dict[str, float]] = list()
- for idx in tqdm(
+ for idx in get_pbar(
  iterable=range(benchmark_config.num_iterations),
  desc="Benchmarking",
  disable=not benchmark_config.progress_bar,
@@ -41,7 +39,7 @@
  itr_scores = benchmark_speed_single_iteration(model=model, itr_idx=idx)
  clear_memory()
  scores.append(itr_scores)
- logger.debug(f"Scores for iteration {idx}: {itr_scores}")
+ log(f"Scores for iteration {idx}: {itr_scores}", level=logging.DEBUG)
  return scores
 
 
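get_pbar comes from the new euroeval/logging_utils.py; judging only from the call site above, it is a thin wrapper around tqdm. A plausible sketch under that assumption, not the real implementation:

# Plausible sketch of get_pbar, inferred only from the call site above
# (iterable=, desc=, disable=). The real function in euroeval/logging_utils.py
# may configure the progress bar differently.
import typing as t

from tqdm.auto import tqdm


def get_pbar(iterable: t.Iterable, desc: str, disable: bool = False) -> tqdm:
    """Return a tqdm progress bar with EuroEval-style default settings."""
    return tqdm(iterable=iterable, desc=desc, disable=disable)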
euroeval/task_group_utils/multiple_choice_classification.py CHANGED
@@ -1,7 +1,6 @@
  """Utility functions related to the multiple-choice classification task group."""
 
  import hashlib
- import logging
  import re
  import typing as t
  from collections import defaultdict
@@ -18,8 +17,6 @@ if t.TYPE_CHECKING:
 
  from ..types import Labels, Predictions
 
- logger = logging.getLogger("euroeval")
-
 
  class MultipleChoiceClassificationTrainer(Trainer):
  """Trainer subclass for multiple-choice classification tasks."""
euroeval/task_group_utils/question_answering.py CHANGED
@@ -1,7 +1,6 @@
  """Utility functions related to the question-answering task group."""
 
  import collections.abc as c
- import logging
  import typing as t
  from collections import defaultdict
 
@@ -26,8 +25,6 @@ if t.TYPE_CHECKING:
  from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
  from ..types import Labels, Predictions
 
- logger = logging.getLogger("euroeval")
-
 
  class QuestionAnsweringTrainer(Trainer):
  """Trainer subclass for question answering tasks."""
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -19,13 +19,15 @@
  from datasets.arrow_dataset import Dataset
  from transformers.trainer_utils import EvalPrediction
 
- from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
+ from ..data_models import (
+ BenchmarkConfig,
+ DatasetConfig,
+ GenerativeModelOutput,
+ ModelConfig,
+ )
  from ..types import Labels, Predictions
 
 
- logger = logging.getLogger("euroeval")
-
-
  def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
@@ -106,6 +108,7 @@
  input_batch: dict[str, list],
  model_output: "GenerativeModelOutput",
  dataset_config: "DatasetConfig",
+ model_config: "ModelConfig",
  first_label_token_mapping: dict[str, str] | bool,
  ) -> list[str]:
  """Extract the predicted labels from the generated output.
@@ -118,6 +121,8 @@
  The raw generated output of the model.
  dataset_config:
  The configuration of the dataset.
+ model_config:
+ The configuration of the model.
  first_label_token_mapping:
  A mapping from labels to the first token in each label, or alternatively a
  Boolean value indicating whether the model should output scores (if the
@@ -167,6 +172,7 @@
  )
 
  new_predicted_labels: list[str] = list()
+ num_predictions_being_very_off = 0
  for idx, predicted_label in enumerate(model_output.sequences):
  # If the prediction includes a boxed answer, use that instead of the full
  # generation
@@ -199,34 +205,40 @@
  # word edit distance to the predicted label (if invalid model outputs are
  # allowed), or we raise an error
  if min(edit_distances) >= 1000:
- if dataset_config.allow_invalid_model_outputs:
- logger.warning(
- "No candidate labels found for the predicted label "
- f"{predicted_label!r}, out of the candidate labels "
- f"{sample_candidate_labels[idx]}. This likely means that the model "
- "output is completely off, but since invalid model outputs are "
- "allowed for this task, we will use the closest candidate label "
- f"({best_candidate_label})) as the output label. If you see this "
- "warning very often, please report this issue to the EuroEval "
- "team at github.com/EuroEval/EuroEval/issues."
- )
- logger.debug(
- "The candidate labels were extracted from the prompt: "
- f"{input_batch['text'][idx]!r}."
- )
- else:
- raise InvalidBenchmark(
- "No candidate labels found for the predicted label "
- f"{predicted_label!r}, out of the candidate labels "
- f"{sample_candidate_labels[idx]}. This likely means that the model "
- "output is completely off, and we cannot extract any labels from "
- "it. Please check the model output and the candidate labels. The "
- "candidate labels were extracted from the prompt: "
- f"{input_batch['text'][idx]!r}."
- )
+ num_predictions_being_very_off += 1
 
  new_predicted_labels.append(best_candidate_label)
 
+ if num_predictions_being_very_off > 0:
+ if dataset_config.allow_invalid_model_outputs:
+ log_msg = (
+ "No candidate labels found for the predicted label in "
+ f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+ f"of the samples with the model {model_config.model_id!r}. This "
+ "likely means that the model were completely off in these cases, "
+ "but since invalid model outputs are allowed for this task, we used "
+ "the closest candidate labels as the output labels."
+ )
+ level = logging.DEBUG
+ if num_predictions_being_very_off / len(model_output.sequences) > 0.5:
+ log_msg += (
+ " Since this happened for most of the model's predictions, please "
+ "report this issue to the EuroEval team at "
+ "github.com/EuroEval/EuroEval/issues."
+ )
+ level = logging.WARNING
+ log_once(log_msg, level=level)
+ else:
+ raise InvalidBenchmark(
+ "No candidate labels found for the predicted label in "
+ f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+ "of the samples. This likely means that the model were completely "
+ "off in these cases. Since this task does not allow invalid model "
+ "outputs, we have to abort the evaluation. Please re-run the "
+ "evaluation with the `--debug` flag (or `debug=True` if you're using "
+ "the `Benchmarker` API) to see the precise model outputs."
+ )
+
  return new_predicted_labels
 
 
@@ -355,7 +367,7 @@
  "be determined. This means that using logprobs to extract the "
  "labels is not reliable, and we will instead fall back to "
  "extracting the labels using word edit distance.",
- level=logging.INFO,
+ level=logging.DEBUG,
  )
  else:
  log_once(
@@ -363,7 +375,7 @@
  "means that using logprobs to extract the labels is not reliable, "
  "and we will instead fall back to extracting the labels using "
  "word edit distance.",
- level=logging.INFO,
+ level=logging.DEBUG,
  )
  return None
 
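The per-sample warnings are collapsed into one aggregated message sent through log_once, which, as the name suggests, emits a given message only once. A generic sketch of such a helper; EuroEval's own log_once lives in the new euroeval/logging_utils.py and its implementation may differ.

# Generic sketch of a "log once" helper; EuroEval's own version may differ.
import logging

logger = logging.getLogger("euroeval")
_seen_messages: set[str] = set()


def log_once(message: str, level: int = logging.INFO) -> None:
    """Log the message at the given level, but only the first time it is seen."""
    if message not in _seen_messages:
        _seen_messages.add(message)
        logger.log(level, message)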
euroeval/task_group_utils/text_to_text.py CHANGED
@@ -7,6 +7,7 @@ import numpy as np
 
  from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
  from ..exceptions import InvalidBenchmark
+ from ..logging_utils import log
  from ..metrics import HuggingFaceMetric
  from ..utils import raise_if_model_output_contains_nan_values
 
@@ -18,9 +19,6 @@
  from ..types import Labels, Predictions
 
 
- logger = logging.getLogger("euroeval")
-
-
  def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
@@ -44,6 +42,10 @@ def compute_metrics(
  Returns:
  A dictionary with the names of the metrics as keys and the metric values as
  values.
+
+ Raises:
+ InvalidBenchmark:
+ If the metric computation fails.
  """
  model_outputs, labels = model_outputs_and_labels
 
@@ -72,7 +74,7 @@
  ):
  metric.compute_kwargs["device"] = benchmark_config.device.type
 
- while True:
+ for _ in range(num_attempts := 5):
  try:
  score: float | None = metric(
  predictions=predictions,
@@ -96,21 +98,28 @@
  and metric.compute_kwargs.get("device", "cpu") != "cpu"
  ):
  metric.compute_kwargs["device"] = "cpu"
- logger.debug(
+ log(
  "Out of memory error occurred during the computation of "
  f"the metric {metric.pretty_name}. Moving the computation to "
- "the CPU."
+ "the CPU.",
+ level=logging.DEBUG,
  )
  else:
  raise InvalidBenchmark(str(e)) from e
  finally:
  for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
  if hasattr(metric, attribute):
- logger.debug(
+ log(
  f"Deleting the {attribute!r} attribute of the metric "
- f"{metric.pretty_name} to free up memory."
+ f"{metric.pretty_name} to free up memory.",
+ level=logging.DEBUG,
  )
  delattr(metric, attribute)
+ else:
+ raise InvalidBenchmark(
+ f"Could not compute the metric {metric.pretty_name} after "
+ f"{num_attempts} attempts due to out of memory errors."
+ )
 
  # The metric returns None if we are running on multi-GPU and the current
  # process is not the main process
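Replacing while True with a bounded loop relies on Python's for/else idiom: the else block runs only when the loop finishes without a break, i.e. when every attempt failed. A standalone illustration of the pattern, with a made-up flaky_metric stand-in:

# Standalone illustration of the bounded-retry for/else idiom adopted above:
# the else clause runs only if no break occurred, i.e. every attempt failed.
import random


def flaky_metric() -> float:
    """Stand-in for a metric computation that sometimes runs out of memory."""
    if random.random() < 0.5:
        raise MemoryError("simulated out-of-memory error")
    return 42.0


for _ in range(num_attempts := 5):
    try:
        score = flaky_metric()
    except MemoryError:
        continue  # e.g. move the computation to the CPU and retry
    break  # success, so the else clause is skipped
else:
    raise RuntimeError(f"Giving up after {num_attempts} failed attempts.")
print(score)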
euroeval/task_group_utils/token_classification.py CHANGED
@@ -7,6 +7,7 @@ from copy import deepcopy
  import numpy as np
 
  from ..exceptions import InvalidBenchmark
+ from ..logging_utils import log
  from ..utils import (
  extract_json_dict_from_string,
  raise_if_model_output_contains_nan_values,
@@ -22,9 +23,6 @@
  from ..types import Labels, Predictions
 
 
- logger = logging.getLogger("euroeval")
-
-
  def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  has_misc_tags: bool,
@@ -216,17 +214,19 @@
  prompt_label_mapping = dataset_config.prompt_label_mapping
  for prompt_tag_name, named_entities in prediction_dict.items():
  if not isinstance(named_entities, list):
- logger.debug(
+ log(
  "The model produced an invalid format for the named entities. "
- f"Expected a list but got {type(named_entities)}. Skipping."
+ f"Expected a list but got {type(named_entities)}. Skipping.",
+ level=logging.DEBUG,
  )
  continue
  try:
  named_entities = [str(ne) for ne in named_entities]
  except Exception:
- logger.debug(
+ log(
  "The model produced an invalid format for the named entities. "
- f"Expected a list of strings but got {named_entities}. Skipping."
+ f"Expected a list of strings but got {named_entities}. Skipping.",
+ level=logging.DEBUG,
  )
  continue
  try:
@@ -236,9 +236,10 @@
  if prompt_tag == prompt_tag_name
  ][0]
  except IndexError:
- logger.debug(
+ log(
  "The model produced an invalid prompt tag name, "
- f"{prompt_tag_name}. Skipping."
+ f"{prompt_tag_name}. Skipping.",
+ level=logging.DEBUG,
  )
  continue
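A pattern running through all of these modules is the removal of the module-level logger = logging.getLogger("euroeval") in favour of a shared log helper from the new euroeval/logging_utils.py (+250 lines). A minimal sketch of what such a wrapper might look like; the real module is considerably richer (it also provides get_pbar and log_once, and may coordinate output with progress bars).

# Minimal sketch of a central log helper of the kind these modules now import;
# the real euroeval/logging_utils.py is much larger and may behave differently.
import logging

logger = logging.getLogger("euroeval")


def log(message: str, level: int = logging.INFO) -> None:
    """Emit the message on the shared 'euroeval' logger at the given level."""
    logger.log(level, message)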