EuroEval 15.2.0-py3-none-any.whl → 15.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -221,7 +221,9 @@ def load_model_and_tokenizer(
 
     match dataset_config.task.task_group:
         case (
-            TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+            TaskGroup.SEQUENCE_CLASSIFICATION
+            | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+            | TaskGroup.SPEED
         ):
             model_cls_mapping = dict(
                 fresh_xlm_roberta_base=XLMRobertaForSequenceClassification,
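
The widened `case` arm above uses Python's structural pattern matching, where an or-pattern (`|`) matches when the subject equals any of its alternatives, so `TaskGroup.SPEED` now falls into the same sequence-classification branch. A minimal sketch with a stand-in `TaskGroup` enum (the real enum lives in `euroeval/enums.py`):

```python
from enum import Enum, auto


class TaskGroup(Enum):
    """Stand-in for euroeval.enums.TaskGroup; member names match the diff."""

    SEQUENCE_CLASSIFICATION = auto()
    MULTIPLE_CHOICE_CLASSIFICATION = auto()
    SPEED = auto()
    TOKEN_CLASSIFICATION = auto()


def pick_model_family(task_group: TaskGroup) -> str:
    # An or-pattern matches if the subject equals any alternative, so SPEED
    # now shares the sequence-classification branch.
    match task_group:
        case (
            TaskGroup.SEQUENCE_CLASSIFICATION
            | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
            | TaskGroup.SPEED
        ):
            return "sequence-classification"
        case _:
            return "other"


assert pick_model_family(TaskGroup.SPEED) == "sequence-classification"
```
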
@@ -1151,7 +1151,7 @@ def get_end_of_reasoning_token_id(
     ):
         log_once(
             message=(
-                f"Detected reasoning token {reasoning_token!r} and end of reasoning "
+                f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
                 f"token {end_of_reasoning_token!r}, but one of them is not registered "
                 "as a special token, so assuming it is not a real reasoning token."
             ),
@@ -1160,7 +1160,11 @@ def get_end_of_reasoning_token_id(
         return None
 
     log_once(
-        message=f"Detected reasoning token {reasoning_token!r}.", level=logging.DEBUG
+        message=(
+            f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+            f"token {end_of_reasoning_token!r}."
+        ),
+        level=logging.DEBUG,
     )
 
     # Encode the end of reasoning token and return its ID
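
Both hunks route through `log_once`, which, judging by its use here, emits a message at the given level only the first time that message is seen. A minimal sketch of such a helper (the actual implementation lives in `euroeval/utils.py` and may differ):

```python
import logging

_seen_messages: set[str] = set()


def log_once(message: str, level: int = logging.DEBUG) -> None:
    """Log `message` at `level`, but only the first time it is seen."""
    if message not in _seen_messages:
        _seen_messages.add(message)
        logging.getLogger("euroeval").log(level, message)
```
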
@@ -1,7 +1,7 @@
 """All dataset configurations used in EuroEval."""
 
 from .data_models import DatasetConfig
-from .languages import DA, DE, EN, FO, FR, IS, NB, NL, NN, NO, SV, get_all_languages
+from .languages import DA, DE, EN, FO, FR, IS, IT, NB, NL, NN, NO, SV, get_all_languages
 from .tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SPEED, SUMM
 
 
@@ -244,6 +244,26 @@ ALLOCINE_CONFIG = DatasetConfig(
     max_generated_tokens=5,
 )
 
+SENTIPOLC_CONFIG = DatasetConfig(
+    name="sentipolc16",
+    pretty_name="the truncated version of the Italian sentiment classification "
+    "dataset Sentipolc-16",
+    huggingface_id="EuroEval/sentipolc16-mini",
+    task=SENT,
+    languages=[IT],
+    labels=["negative", "neutral", "positive"],
+    prompt_prefix="Di seguito sono riportati i testi e il loro sentimento, che può "
+    "essere 'positivo', 'neutro' o 'negativo'.",
+    prompt_template="Tweet: {text}\nSentimento: {label}",
+    prompt_label_mapping=dict(
+        positive="positivo", neutral="neutro", negative="negativo"
+    ),
+    instruction_prompt="Tweet: {text}\n\nClassificare il sentimento nel Tweet. "
+    "Rispondete con 'positivo', 'neutro' o 'negativo', e nient'altro.",
+    num_few_shot_examples=12,
+    max_generated_tokens=5,
+)
+
 
 ### NAMED ENTITY RECOGNITION DATASETS ###
 
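
To see how fields like `prompt_prefix`, `prompt_template` and `prompt_label_mapping` fit together, here is a hypothetical few-shot prompt builder for the sentipolc16 config; the invented examples and the exact joining logic are illustrative only, not EuroEval's actual rendering code:

```python
prompt_prefix = (
    "Di seguito sono riportati i testi e il loro sentimento, che può "
    "essere 'positivo', 'neutro' o 'negativo'."
)
prompt_template = "Tweet: {text}\nSentimento: {label}"
prompt_label_mapping = {"positive": "positivo", "neutral": "neutro", "negative": "negativo"}

few_shot_examples = [
    ("Che bella giornata!", "positive"),  # invented example
    ("Servizio pessimo, non ci torno.", "negative"),  # invented example
]


def build_prompt(new_text: str) -> str:
    """Render the few-shot examples plus an unlabelled final example."""
    rendered = [
        prompt_template.format(text=text, label=prompt_label_mapping[label])
        for text, label in few_shot_examples
    ]
    # The final example leaves the label blank for the model to complete.
    rendered.append(prompt_template.format(text=new_text, label=""))
    return prompt_prefix + "\n\n" + "\n\n".join(rendered)


print(build_prompt("Non male, ma poteva andare meglio."))
```
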
@@ -718,6 +738,85 @@ WIKIANN_FO_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WIKINEURAL_IT_CONFIG = DatasetConfig(
+    name="wikineural-it",
+    pretty_name="the truncated version of the Italian named "
+    "entity recognition dataset WikiNEuRal IT",
+    huggingface_id="EuroEval/wikineural-mini-it",
+    task=NER,
+    languages=[IT],
+    labels=[
+        "o",
+        "b-loc",
+        "i-loc",
+        "b-org",
+        "i-org",
+        "b-per",
+        "i-per",
+        "b-misc",
+        "i-misc",
+    ],
+    prompt_prefix="Di seguito sono riportate le frasi e i dizionari JSON con le entità "
+    "denominate presenti nella frase data.",
+    prompt_template="Frase: {text}\nEntità denominate: {label}",
+    prompt_label_mapping={
+        "b-per": "persona",
+        "i-per": "persona",
+        "b-loc": "posizione",
+        "i-loc": "posizione",
+        "b-org": "organizzazione",
+        "i-org": "organizzazione",
+        "b-misc": "varie",
+        "i-misc": "varie",
+    },
+    instruction_prompt="Frase: {text}\n\nIdentificare le entità nominate nella frase. "
+    "Il risultato dovrebbe essere un dizionario JSON con le chiavi 'persona', "
+    "'posizione', 'organizzazione' e 'varie'. I valori devono essere elenchi di entità "
+    "nominate di quel tipo, esattamente come appaiono nella frase.",
+    num_few_shot_examples=8,
+    max_generated_tokens=128,
+    unofficial=True,
+)
+
+MULTINERD_IT_CONFIG = DatasetConfig(
+    name="multinerd-it",
+    pretty_name="the truncated version of the Italian part of the named "
+    "entity recognition dataset MultiNERD",
+    huggingface_id="EuroEval/multinerd-mini-it",
+    task=NER,
+    languages=[IT],
+    labels=[
+        "o",
+        "b-loc",
+        "i-loc",
+        "b-org",
+        "i-org",
+        "b-per",
+        "i-per",
+        "b-misc",
+        "i-misc",
+    ],
+    prompt_prefix="Di seguito sono riportate le frasi e i dizionari JSON con le entità "
+    "denominate presenti nella frase data.",
+    prompt_template="Frase: {text}\nEntità denominate: {label}",
+    prompt_label_mapping={
+        "b-per": "persona",
+        "i-per": "persona",
+        "b-loc": "posizione",
+        "i-loc": "posizione",
+        "b-org": "organizzazione",
+        "i-org": "organizzazione",
+        "b-misc": "varie",
+        "i-misc": "varie",
+    },
+    instruction_prompt="Frase: {text}\n\nIdentificare le entità nominate nella frase. "
+    "Il risultato dovrebbe essere un dizionario JSON con le chiavi 'persona', "
+    "'posizione', 'organizzazione' e 'varie'. I valori devono essere elenchi di entità "
+    "nominate di quel tipo, esattamente come appaiono nella frase.",
+    num_few_shot_examples=8,
+    max_generated_tokens=128,
+)
+
 
 ### LINGUISTIC ACCEPTABILITY DATASETS ###
 
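
Both NER instruction prompts ask the model for a JSON dictionary keyed by 'persona', 'posizione', 'organizzazione' and 'varie'. A sketch of how such a completion could be parsed defensively (a hypothetical helper, not EuroEval's actual answer parser):

```python
import json

EXPECTED_KEYS = ("persona", "posizione", "organizzazione", "varie")


def parse_ner_answer(raw: str) -> dict[str, list[str]]:
    """Parse a model completion into the four expected entity lists."""
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        parsed = {}
    if not isinstance(parsed, dict):
        parsed = {}
    # Fill in any keys the model omitted, and drop any extra keys.
    return {key: list(parsed.get(key, [])) for key in EXPECTED_KEYS}


completion = '{"persona": ["Dante Alighieri"], "posizione": ["Firenze"]}'
print(parse_ner_answer(completion))
# {'persona': ['Dante Alighieri'], 'posizione': ['Firenze'],
#  'organizzazione': [], 'varie': []}
```
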
@@ -789,6 +888,25 @@ SCALA_NN_CONFIG = DatasetConfig(
     max_generated_tokens=5,
 )
 
+NO_COLA_CONFIG = DatasetConfig(
+    name="no-cola",
+    pretty_name="the truncated version of the Norwegian linguistic acceptability "
+    "dataset NoCoLA",
+    huggingface_id="EuroEval/no-cola-mini",
+    task=LA,
+    languages=[NB, NO],
+    labels=["incorrect", "correct"],
+    prompt_prefix="Følgende er setninger og hvorvidt de er grammatisk korrekte.",
+    prompt_template="Setning: {text}\nGrammatisk korrekt: {label}",
+    instruction_prompt="Setning: {text}\n\nBestem om setningen er grammatisk korrekt "
+    "eller ikke. Svar med 'ja' hvis setningen er korrekt og 'nei' hvis den ikke er, "
+    "og ikke noe annet.",
+    prompt_label_mapping=dict(correct="ja", incorrect="nei"),
+    num_few_shot_examples=12,
+    max_generated_tokens=5,
+    unofficial=True,
+)
+
 SCALA_IS_CONFIG = DatasetConfig(
     name="scala-is",
     pretty_name="the Icelandic part of the linguistic acceptability dataset ScaLA",
@@ -893,6 +1011,24 @@ SCALA_FR_CONFIG = DatasetConfig(
     max_generated_tokens=5,
 )
 
+SCALA_IT_CONFIG = DatasetConfig(
+    name="scala-it",
+    pretty_name="the Italian part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-it",
+    task=LA,
+    languages=[IT],
+    labels=["incorrect", "correct"],
+    prompt_prefix="Di seguito sono riportate le frasi e la loro correttezza "
+    "grammaticale.",
+    prompt_template="Frase : {text}\nGrammaticalmente corretto : {label}",
+    prompt_label_mapping=dict(correct="si", incorrect="no"),
+    instruction_prompt="Frase: {text}\n\nStabilite se la frase è grammaticalmente "
+    "corretta o meno. Rispondete con 'si' se la frase è corretta e con 'no' se "
+    "non lo è, e nient'altro.",
+    num_few_shot_examples=12,
+    max_generated_tokens=5,
+)
+
 DUTCH_COLA_CONFIG = DatasetConfig(
     name="dutch-cola",
     pretty_name="the truncated version of the Dutch linguistic acceptability dataset "
@@ -1139,10 +1275,26 @@ SQUAD_NL_CONFIG = DatasetConfig(
     max_generated_tokens=32,
 )
 
+SQUAD_IT_CONFIG = DatasetConfig(
+    name="squad-it",
+    pretty_name="the truncated version of the Italian reading comprehension dataset "
+    "SQuAD-it, translated from the English SQuAD dataset",
+    huggingface_id="EuroEval/squad-it-mini",
+    task=RC,
+    languages=[IT],
+    labels=["start_positions", "end_positions"],
+    prompt_prefix="I testi che seguono sono accompagnati da domande e risposte.",
+    prompt_template="Testo: {text}\nDomanda: {question}\nRispondere in massimo "
+    "3 parole: {label}",
+    instruction_prompt="Testo: {text}\n\nRispondi alla seguente domanda sul "
+    "in un massimo di 3 parole.\n\nDomanda: {question}",
+    num_few_shot_examples=4,
+    max_generated_tokens=32,
+)
+
 ICELANDIC_QA_CONFIG = DatasetConfig(
     name="icelandic-qa",
-    pretty_name="the Icelandic reading comprehension dataset about Icelandic culture "
-    "and history",
+    pretty_name="the Icelandic reading comprehension dataset IcelandicQA",
     huggingface_id="EuroEval/icelandic-qa",
     task=RC,
     languages=[IS],
@@ -1352,6 +1504,20 @@ ORANGE_SUM_CONFIG = DatasetConfig(
     max_generated_tokens=256,
 )
 
+ILPOST_SUM_CONFIG = DatasetConfig(
+    name="ilpost-sum",
+    pretty_name="the truncated version of the Italian summarisation dataset IlPost",
+    huggingface_id="EuroEval/ilpost-sum",
+    task=SUMM,
+    languages=[IT],
+    prompt_prefix="Di seguito sono riportati gli articoli con i relativi riassunti.",
+    prompt_template="Articolo di cronaca: {text}\nSintesi: {target_text}",
+    instruction_prompt="Articolo di cronaca: {text}\n\nScrivete un riassunto "
+    "dell'articolo sopra citato.",
+    num_few_shot_examples=1,
+    max_generated_tokens=256,
+)
+
 
 # TODO: Faroese summarization
 
@@ -1377,7 +1543,7 @@ DANSKE_TALEMAADER_CONFIG = DatasetConfig(
 DANISH_CITIZEN_TESTS_CONFIG = DatasetConfig(
     name="danish-citizen-tests",
     pretty_name="the Danish knowledge dataset Danish Citizen Tests",
-    huggingface_id="EuroEval/danish-citizen-tests",
+    huggingface_id="EuroEval/danish-citizen-tests-updated",
     task=KNOW,
     languages=[DA],
     labels=["a", "b", "c", "d"],
@@ -1390,6 +1556,22 @@ DANISH_CITIZEN_TESTS_CONFIG = DatasetConfig(
     max_generated_tokens=5,
 )
 
+NRK_QUIZ_QA_CONFIG = DatasetConfig(
+    name="nrk-quiz-qa",
+    pretty_name="the truncated version of the Norwegian knowledge dataset NRK Quiz QA",
+    huggingface_id="EuroEval/nrk-quiz-qa-mini",
+    task=KNOW,
+    languages=[NB, NN, NO],
+    labels=["a", "b", "c", "d"],
+    prompt_prefix="Følgende er flervalgsspørsmål (med svar).",
+    prompt_template="Spørsmål: {text}\nSvar: {label}",
+    prompt_label_mapping=dict(a="a", b="b", c="c", d="d"),
+    instruction_prompt="Spørsmål: {text}\n\nBesvar følgende spørsmål med 'a', 'b', "
+    "'c' eller 'd', og ikke noe annet.",
+    num_few_shot_examples=5,
+    max_generated_tokens=5,
+)
+
 MMLU_NO_CONFIG = DatasetConfig(
     name="mmlu-no",
     pretty_name="the truncated version of the Norwegian knowledge dataset MMLU-no, "
@@ -1405,6 +1587,7 @@ MMLU_NO_CONFIG = DatasetConfig(
     "'c' eller 'd', og ikke noe annet.",
     num_few_shot_examples=5,
     max_generated_tokens=5,
+    unofficial=True,
 )
 
 MMLU_SV_CONFIG = DatasetConfig(
@@ -1444,7 +1627,8 @@ MMLU_IS_CONFIG = DatasetConfig(
 
 ICELANDIC_KNOWLEDGE_CONFIG = DatasetConfig(
     name="icelandic-knowledge",
-    pretty_name="the IcelandicQA dataset phrased as a knowledge dataset",
+    pretty_name="the Icelandic knowledge dataset IcelandicKnowledge, derived from the "
+    "IcelandicQA dataset",
     huggingface_id="EuroEval/icelandic-knowledge",
     task=KNOW,
     languages=[IS],
@@ -1456,7 +1640,6 @@ ICELANDIC_KNOWLEDGE_CONFIG = DatasetConfig(
     "'b', 'c' eða 'd'.",
     num_few_shot_examples=5,
     max_generated_tokens=5,
-    unofficial=True,
 )
 
 MMLU_DE_CONFIG = DatasetConfig(
@@ -1545,6 +1728,23 @@ MMLU_FR_CONFIG = DatasetConfig(
     max_generated_tokens=5,
 )
 
+MMLU_IT_CONFIG = DatasetConfig(
+    name="mmlu-it",
+    pretty_name="the truncated version of the Italian knowledge dataset MMLU-it, "
+    "translated from the English MMLU dataset",
+    huggingface_id="EuroEval/mmlu-it-mini",
+    task=KNOW,
+    languages=[IT],
+    labels=["a", "b", "c", "d"],
+    prompt_prefix="Le seguenti sono domande a scelta multipla (con relative risposte).",
+    prompt_template="Domanda: {text}\nRéponse: {label}",
+    prompt_label_mapping=dict(a="a", b="b", c="c", d="d"),
+    instruction_prompt="Domanda: {text}\n\nRispondete alla domanda precedente con "
+    "'a', 'b', 'c' o 'd' e nient'altro.",
+    num_few_shot_examples=5,
+    max_generated_tokens=5,
+)
+
 ARC_DA_CONFIG = DatasetConfig(
     name="arc-da",
     pretty_name="the truncated version of the Danish knowledge dataset ARC-da, "
@@ -1614,6 +1814,7 @@ ARC_IS_CONFIG = DatasetConfig(
     "'b', 'c' eða 'd', og engu öðru.",
     num_few_shot_examples=5,
     max_generated_tokens=5,
+    unofficial=True,
 )
 
 ARC_DE_CONFIG = DatasetConfig(
@@ -1691,6 +1892,23 @@ HELLASWAG_DA_CONFIG = DatasetConfig(
     max_generated_tokens=5,
 )
 
+NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
+    name="nor-common-sense-qa",
+    pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
+    "NorCommonSenseQA",
+    huggingface_id="EuroEval/nor-common-sense-qa",
+    task=COMMON_SENSE,
+    languages=[NB, NN, NO],
+    labels=["a", "b", "c", "d", "e"],
+    prompt_prefix="Følgende er flervalgsspørsmål (med svar).",
+    prompt_template="Spørsmål: {text}\nSvar: {label}",
+    prompt_label_mapping=dict(a="a", b="b", c="c", d="d", e="e"),
+    instruction_prompt="Spørsmål: {text}\n\nBesvar følgende spørsmål med 'a', 'b', "
+    "'c' eller 'd', og ikke noe annet.",
+    num_few_shot_examples=5,
+    max_generated_tokens=5,
+)
+
 HELLASWAG_NO_CONFIG = DatasetConfig(
     name="hellaswag-no",
     pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
@@ -1706,6 +1924,7 @@ HELLASWAG_NO_CONFIG = DatasetConfig(
     "'c' eller 'd', og ikke noe annet.",
     num_few_shot_examples=5,
     max_generated_tokens=5,
+    unofficial=True,
 )
 
 HELLASWAG_SV_CONFIG = DatasetConfig(
@@ -1829,6 +2048,23 @@ HELLASWAG_FR_CONFIG = DatasetConfig(
     max_generated_tokens=5,
 )
 
+HELLASWAG_IT_CONFIG = DatasetConfig(
+    name="hellaswag-it",
+    pretty_name="the truncated version of the Italian common-sense reasoning dataset "
+    "HellaSwag-it, translated from the English HellaSwag dataset",
+    huggingface_id="EuroEval/hellaswag-it-mini",
+    task=COMMON_SENSE,
+    languages=[IT],
+    labels=["a", "b", "c", "d"],
+    prompt_prefix="Le seguenti sono domande a scelta multipla (con relative risposte).",
+    prompt_template="Domanda: {text}\nRéponse: {label}",
+    prompt_label_mapping=dict(a="a", b="b", c="c", d="d"),
+    instruction_prompt="Domanda: {text}\n\nRispondete alla domanda precedente con "
+    "'a', 'b', 'c' o 'd' e nient'altro.",
+    num_few_shot_examples=5,
+    max_generated_tokens=5,
+)
+
 # TODO: Faroese common sense reasoning
 
 
@@ -8,7 +8,7 @@ from collections import defaultdict
 import evaluate
 import numpy as np
 from evaluate import EvaluationModule
-from transformers import PreTrainedTokenizer
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
 from transformers.trainer import Trainer
 
 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
@@ -21,12 +21,8 @@ if t.TYPE_CHECKING:
     import torch.nn as nn
     from datasets.arrow_dataset import Dataset
     from transformers import (
-        BaseImageProcessor,
         EvalPrediction,
-        FeatureExtractionMixin,
         PreTrainedModel,
-        PreTrainedTokenizerBase,
-        ProcessorMixin,
         TrainerCallback,
         TrainingArguments,
     )
@@ -65,7 +61,7 @@ class QuestionAnsweringTrainer(Trainer):
 
         # Get the CLS token id for the tokenizer
         if self.tokenizer is not None:
-            assert isinstance(self.tokenizer, PreTrainedTokenizer)
+            assert isinstance(self.tokenizer, PreTrainedTokenizerBase)
             special_token_metadata = get_special_token_metadata(self.tokenizer)
             self.cls_token_id = special_token_metadata["cls_token_id"]
 
@@ -147,7 +143,7 @@ class QuestionAnsweringTrainer(Trainer):
 
 
 def compute_metrics(
-    model_outputs_and_labels: tuple["Predictions", "Labels"],
+    model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
@@ -167,6 +163,13 @@ def compute_metrics(
             values.
     """
     model_outputs, labels = model_outputs_and_labels
+
+    # If the model outputs is a pair, then the first element corresponds to the model
+    # predictions
+    if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
+        model_outputs = model_outputs[0]
+
+    assert not isinstance(model_outputs, tuple)
     raise_if_model_output_contains_nan_values(model_output=model_outputs)
 
     metrics = {
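
The same guard is added to each task's `compute_metrics` because transformers' `EvalPrediction` unpacks into `(predictions, label_ids)`, and `predictions` can itself arrive as a tuple of arrays. A minimal sketch with a stand-in class, mirroring the unwrapping above (the stand-in is hypothetical; only the unpacking behaviour is being illustrated):

```python
from dataclasses import dataclass
from typing import Any

import numpy as np


@dataclass
class EvalPredictionStandIn:
    """Minimal stand-in for transformers.EvalPrediction, which also unpacks as a pair."""

    predictions: Any
    label_ids: Any

    def __iter__(self):
        return iter((self.predictions, self.label_ids))


# Here `predictions` arrives as a pair of arrays; per the diff's comment, the
# first element is taken to be the actual model predictions.
eval_pred = EvalPredictionStandIn(
    predictions=(np.zeros((2, 8)), np.zeros((2, 8))), label_ids=np.array([1, 3])
)

model_outputs, labels = eval_pred
if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
    model_outputs = model_outputs[0]
assert not isinstance(model_outputs, tuple)
print(model_outputs.shape)  # (2, 8)
```
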
@@ -13,6 +13,8 @@ from ..data_models import BenchmarkConfig, GenerativeModelOutput
 from ..utils import log_once, raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
+    from transformers import EvalPrediction
+
     from ..data_models import DatasetConfig
     from ..types import Labels, Predictions
 
@@ -21,7 +23,7 @@ logger = logging.getLogger("euroeval")
 
 
 def compute_metrics(
-    model_outputs_and_labels: tuple["Predictions", "Labels"],
+    model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
@@ -42,7 +44,11 @@ def compute_metrics(
     """
     model_outputs, labels = model_outputs_and_labels
     label2id = {label: idx for idx, label in dataset_config.id2label.items()}
-    raise_if_model_output_contains_nan_values(model_output=model_outputs)
+
+    # If the model outputs is a pair, then the first element corresponds to the model
+    # predictions
+    if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
+        model_outputs = model_outputs[0]
 
     metrics = {
         metric_cfg.name: (
@@ -61,6 +67,9 @@ def compute_metrics(
     else:
         predictions = model_outputs
 
+    assert not isinstance(model_outputs, tuple)
+    raise_if_model_output_contains_nan_values(model_output=model_outputs)
+
     prompt_label_to_label_mapping = {
         prompt_label: label
         for label, prompt_label in dataset_config.prompt_label_mapping.items()
@@ -17,6 +17,8 @@ from ..utils import (
 )
 
 if t.TYPE_CHECKING:
+    from transformers import EvalPrediction
+
     from ..types import Labels, Predictions
 
 
@@ -24,7 +26,7 @@ logger = logging.getLogger("euroeval")
 
 
 def compute_metrics(
-    model_outputs_and_labels: tuple["Predictions", "Labels"],
+    model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
@@ -44,6 +46,13 @@ def compute_metrics(
             values.
     """
     model_outputs, labels = model_outputs_and_labels
+
+    # If the model outputs is a pair, then the first element corresponds to the model
+    # predictions
+    if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
+        model_outputs = model_outputs[0]
+
+    assert not isinstance(model_outputs, tuple)
     raise_if_model_output_contains_nan_values(model_output=model_outputs)
 
     metrics = {
@@ -16,7 +16,7 @@ from ..exceptions import InvalidBenchmark, NeedsExtraInstalled
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
-    from transformers import BatchEncoding
+    from transformers import BatchEncoding, EvalPrediction
 
     from ..types import Labels, Predictions
 
@@ -28,7 +28,7 @@ logger = logging.getLogger("euroeval")
 
 
 def compute_metrics(
-    model_outputs_and_labels: tuple["Predictions", "Labels"],
+    model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     has_misc_tags: bool,
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -51,7 +51,11 @@ def compute_metrics(
             values.
     """
     model_outputs, labels = model_outputs_and_labels
-    raise_if_model_output_contains_nan_values(model_output=model_outputs)
+
+    # If the model outputs is a pair, then the first element corresponds to the model
+    # predictions
+    if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
+        model_outputs = model_outputs[0]
 
     metrics = {
         metric_cfg.name: (
@@ -93,6 +97,8 @@ def compute_metrics(
     else:
         predictions = model_outputs  # type: ignore[assignment]
 
+    raise_if_model_output_contains_nan_values(model_output=predictions)
+
     # Replace predicted tag with either MISC or O tags if they are not part of the
     # dataset
     labels_without_misc = {
euroeval/utils.py CHANGED
@@ -21,7 +21,7 @@ import requests
 import torch
 from datasets.utils import disable_progress_bar
 from requests.exceptions import RequestException
-from transformers import PreTrainedTokenizer
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
 from transformers import logging as tf_logging
 
 from .exceptions import InvalidModel, NaNValueInModelOutput
@@ -231,7 +231,7 @@ def internet_connection_available() -> bool:
         return False
 
 
-def get_special_token_metadata(tokenizer: "PreTrainedTokenizer") -> dict:
+def get_special_token_metadata(tokenizer: "PreTrainedTokenizerBase") -> dict:
     """Get the special token metadata for a tokenizer.
 
     Args:
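
The widened annotation matters because, in transformers, fast tokenizers derive from `PreTrainedTokenizerBase` but not from the slow-tokenizer class `PreTrainedTokenizer`, so the old `isinstance` assert in `QuestionAnsweringTrainer` would reject them. A quick check (assuming a recent transformers version):

```python
from transformers import (
    PreTrainedTokenizer,
    PreTrainedTokenizerBase,
    PreTrainedTokenizerFast,
)

# Both tokenizer flavours share the common base class...
assert issubclass(PreTrainedTokenizer, PreTrainedTokenizerBase)
assert issubclass(PreTrainedTokenizerFast, PreTrainedTokenizerBase)

# ...but a fast tokenizer is not a PreTrainedTokenizer, which is why the
# assert and this type hint were widened to the base class.
assert not issubclass(PreTrainedTokenizerFast, PreTrainedTokenizer)
```
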
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.2.0
+Version: 15.3.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -49,6 +49,7 @@ Requires-Dist: sacremoses>=0.1.1
 Requires-Dist: scikit-learn<1.6.0
 Requires-Dist: sentencepiece>=0.1.96
 Requires-Dist: seqeval>=1.2.2
+Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
 Requires-Dist: torch>=2.3.0
@@ -76,6 +77,8 @@ Description-Content-Type: text/markdown
 
 ### The robust European language model benchmark.
 
+_(formerly known as ScandEval)_
+
 ______________________________________________________________________
 [![Documentation](https://img.shields.io/badge/docs-passing-green)](https://euroeval.com)
 [![PyPI Status](https://badge.fury.io/py/euroeval.svg)](https://pypi.org/project/euroeval/)
@@ -6,7 +6,7 @@ euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
 euroeval/constants.py,sha256=qFrm3cRT6UlnTXfHUmxqZsr0SBsGskjV1qrUlnAW-aw,1473
 euroeval/data_loading.py,sha256=IHd1H4OCAtOyiro7YnJsGbbT7PTwiMUB02gh1g6Nlhg,3116
 euroeval/data_models.py,sha256=4ZY9x2pINlRywTzYxxtrYG7qXMNdod5I9XBOlTJYT8E,14495
-euroeval/dataset_configs.py,sha256=2t0S6MqLjVLH1T7qQCpkPkAAev2KBZVAlqWVJ-K53ls,75351
+euroeval/dataset_configs.py,sha256=Cj3McxA0JTC7RKzXofzpJfmIhoXAfF756f_1SZUaPlw,84391
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
 euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
@@ -20,21 +20,21 @@ euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
 euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
 euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
 euroeval/types.py,sha256=xvBn0eNynqAqwL7CGEgVFb_lCD9SdHUMvxJo7OXRfls,2367
-euroeval/utils.py,sha256=lbiLcVPVPkvp7lLHUJqhAb6X0y8S_sqSrzXAqmfzFe0,18707
+euroeval/utils.py,sha256=K4z2IQilLJo6Cf8bzM46PYTaylDv6bYi7FRbHTbZulE,18736
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
-euroeval/benchmark_modules/fresh.py,sha256=3R2k3Vp7J4YY8Nw5osbDIyayPtLLa2mItJGJFyyYNkY,9599
+euroeval/benchmark_modules/fresh.py,sha256=k6bqDEnazRAX9ILVsRrzUTbkgNO4NcLCxHToCnLWV8M,9641
 euroeval/benchmark_modules/hf.py,sha256=n3VIUA7XOOTgbSMkmYp5S06iJV0kp7aMq8YzRb0EDLw,41741
 euroeval/benchmark_modules/litellm.py,sha256=uMPzUjTU54UHDmBImzWUFCGUupKvZNQN-2u0c8UaM3s,34488
-euroeval/benchmark_modules/vllm.py,sha256=enLKALixXvz2qvfblGEfRwU7wb-X-7HkOdjcYpdA3xM,43341
+euroeval/benchmark_modules/vllm.py,sha256=cw7onFYXQ66cr2c4WTB90VYtQYc47lkwz6A25FW8sBs,43444
 euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
-euroeval/task_utils/question_answering.py,sha256=NYl3g7r84e9uaEObj_-fTFKof-WkkCQ_H_VSJ3UDS1M,27112
-euroeval/task_utils/sequence_classification.py,sha256=JyGLIfMvF98emmnsfckomdzJWluVj1EeAzSLZmJFpOk,8203
-euroeval/task_utils/text_to_text.py,sha256=-9iz5nR9Ib-9xOolDQM0-QJ7k4iSjDP3togE1wgxsDw,5374
-euroeval/task_utils/token_classification.py,sha256=7BSBTBL7GBYOJQlK4se3h6C6HdjMec1gGgquJNXYlaI,17738
-euroeval-15.2.0.dist-info/METADATA,sha256=C3bNw5fBxAFG_aOLRg6tqXsL-cb4uRoq0qsTBmRmf50,10196
-euroeval-15.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-euroeval-15.2.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
-euroeval-15.2.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
-euroeval-15.2.0.dist-info/RECORD,,
+euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
+euroeval/task_utils/sequence_classification.py,sha256=FrkvFzxFSnZoXThgpQqvJCIy3_YemyqZFQ1L-YdMMiw,8527
+euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
+euroeval/task_utils/token_classification.py,sha256=yT1YvZzmqNaVSRZ67BvyURhlkgTm3ltWPft4HxodZAE,17983
+euroeval-15.3.0.dist-info/METADATA,sha256=Mlz6DcLg2H3aWoCXngQZNdFMrJmUFpAdD0FD0wsBKHw,10263
+euroeval-15.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.3.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.3.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.3.0.dist-info/RECORD,,