EuroEval 15.2.0__py3-none-any.whl → 15.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/benchmark_modules/fresh.py +3 -1
- euroeval/benchmark_modules/vllm.py +6 -2
- euroeval/benchmarker.py +10 -12
- euroeval/data_loading.py +9 -3
- euroeval/dataset_configs.py +242 -6
- euroeval/task_utils/question_answering.py +10 -7
- euroeval/task_utils/sequence_classification.py +11 -2
- euroeval/task_utils/text_to_text.py +10 -1
- euroeval/task_utils/token_classification.py +9 -3
- euroeval/utils.py +2 -2
- {euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/METADATA +4 -1
- {euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/RECORD +15 -15
- {euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/WHEEL +0 -0
- {euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/entry_points.txt +0 -0
- {euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/fresh.py CHANGED
@@ -221,7 +221,9 @@ def load_model_and_tokenizer(

    match dataset_config.task.task_group:
        case (
-           TaskGroup.SEQUENCE_CLASSIFICATION
+           TaskGroup.SEQUENCE_CLASSIFICATION
+           | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+           | TaskGroup.SPEED
        ):
            model_cls_mapping = dict(
                fresh_xlm_roberta_base=XLMRobertaForSequenceClassification,
euroeval/benchmark_modules/vllm.py CHANGED
@@ -1151,7 +1151,7 @@ get_end_of_reasoning_token_id(
    ):
        log_once(
            message=(
-               f"Detected reasoning token {reasoning_token!r} and end
+               f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
                f"token {end_of_reasoning_token!r}, but one of them is not registered "
                "as a special token, so assuming it is not a real reasoning token."
            ),
@@ -1160,7 +1160,11 @@ get_end_of_reasoning_token_id(
        return None

    log_once(
-       message=
+       message=(
+           f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+           f"token {end_of_reasoning_token!r}."
+       ),
+       level=logging.DEBUG,
    )

    # Encode the end of reasoning token and return its ID
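The fresh.py change above widens a single `case` arm rather than duplicating the branch: with structural pattern matching, `|` combines several enum members into one or-pattern. A minimal sketch of the construct (the `TaskGroup` members here are stand-ins mirroring the names in the diff):

```python
from enum import Enum, auto


class TaskGroup(Enum):
    """Stand-in for euroeval.enums.TaskGroup; members assumed from the diff."""

    SEQUENCE_CLASSIFICATION = auto()
    MULTIPLE_CHOICE_CLASSIFICATION = auto()
    SPEED = auto()
    TOKEN_CLASSIFICATION = auto()


def pick_model_family(task_group: TaskGroup) -> str:
    # A single `case` arm with `|` (or-patterns, Python 3.10+) matches any of
    # the listed members, so all three task groups share one branch.
    match task_group:
        case (
            TaskGroup.SEQUENCE_CLASSIFICATION
            | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
            | TaskGroup.SPEED
        ):
            return "sequence-classification"
        case _:
            return "other"


assert pick_model_family(TaskGroup.SPEED) == "sequence-classification"
```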
euroeval/benchmarker.py CHANGED
@@ -18,7 +18,7 @@ from .data_loading import load_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
 from .enums import Device, ModelType
-from .exceptions import InvalidBenchmark, InvalidModel
+from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
 from .generation import generate
 from .model_config import get_model_config
@@ -769,23 +769,21 @@ class Benchmarker:
                logger.debug(f"Results:\n{results}")
                return record

+           except HuggingFaceHubDown:
+               wait_time = 30
+               logger.debug(
+                   f"The Hugging Face Hub seems to be down. Retrying in {wait_time} "
+                   "seconds."
+               )
+               sleep(wait_time)
+               continue
+
            except (InvalidBenchmark, InvalidModel) as e:
                # If the model ID is not valid then raise an error
                model_err_msg = "does not exist on the Hugging Face Hub"
                if benchmark_config.raise_errors and model_err_msg in str(e):
                    raise e

-               # Otherwise, if the error is due to Hugging Face Hub being down, then
-               # wait a bit and try again
-               elif "The Hugging Face Hub seems to be down." in str(e):
-                   wait_time = 30
-                   logger.debug(
-                       "The Hugging Face Hub seems to be down. Retrying in "
-                       f"{wait_time} seconds."
-                   )
-                   sleep(wait_time)
-                   continue
-
                # Otherwise, if the error is due to the MPS fallback not being enabled,
                # then raise an error asking the user to enable it
                elif "PYTORCH_ENABLE_MPS_FALLBACK" in str(e):
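The benchmarker now catches a dedicated `HuggingFaceHubDown` exception before the generic `InvalidBenchmark`/`InvalidModel` handler, instead of string-matching on the exception message. A minimal sketch of the resulting retry loop, with a hypothetical `benchmark_once` callable standing in for the real benchmarking step:

```python
import logging
from time import sleep

logger = logging.getLogger("euroeval")


class HuggingFaceHubDown(Exception):
    """Raised when the Hugging Face Hub appears to be unreachable."""


def benchmark_with_retry(benchmark_once):
    # Keep retrying while the Hub is down; a typed exception is robust
    # against wording changes in error messages, unlike `str(e)` matching.
    while True:
        try:
            return benchmark_once()
        except HuggingFaceHubDown:
            wait_time = 30
            logger.debug(
                f"The Hugging Face Hub seems to be down. Retrying in {wait_time} "
                "seconds."
            )
            sleep(wait_time)
            continue
```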
euroeval/data_loading.py CHANGED
@@ -10,7 +10,7 @@ from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator

 from .data_models import BenchmarkConfig, DatasetConfig
-from .exceptions import InvalidBenchmark
+from .exceptions import HuggingFaceHubDown, InvalidBenchmark
 from .utils import unscramble

 logger = logging.getLogger("euroeval")
@@ -31,6 +31,12 @@ def load_data(

    Returns:
        A list of bootstrapped datasets, one for each iteration.
+
+   Raises:
+       InvalidBenchmark:
+           If the dataset cannot be loaded.
+       HuggingFaceHubDown:
+           If the Hugging Face Hub is down.
    """
    num_attempts = 5
    for _ in range(num_attempts):
@@ -41,14 +47,14 @@ def load_data(
                token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
            )
            break
-       except (FileNotFoundError, DatasetsError):
+       except (FileNotFoundError, DatasetsError, ConnectionError):
            logger.warning(
                f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
            )
            time.sleep(1)
            continue
        except HfHubHTTPError:
-           raise
+           raise HuggingFaceHubDown()
    else:
        raise InvalidBenchmark(
            f"Failed to load dataset {dataset_config.huggingface_id!r} after "
euroeval/dataset_configs.py CHANGED
@@ -1,7 +1,7 @@
 """All dataset configurations used in EuroEval."""

 from .data_models import DatasetConfig
-from .languages import DA, DE, EN, FO, FR, IS, NB, NL, NN, NO, SV, get_all_languages
+from .languages import DA, DE, EN, FO, FR, IS, IT, NB, NL, NN, NO, SV, get_all_languages
 from .tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SPEED, SUMM


@@ -244,6 +244,26 @@ ALLOCINE_CONFIG = DatasetConfig(
    max_generated_tokens=5,
 )

+SENTIPOLC_CONFIG = DatasetConfig(
+   name="sentipolc16",
+   pretty_name="the truncated version of the Italian sentiment classification "
+   "dataset Sentipolc-16",
+   huggingface_id="EuroEval/sentipolc16-mini",
+   task=SENT,
+   languages=[IT],
+   labels=["negative", "neutral", "positive"],
+   prompt_prefix="Di seguito sono riportati i testi e il loro sentimento, che può "
+   "essere 'positivo', 'neutro' o 'negativo'.",
+   prompt_template="Tweet: {text}\nSentimento: {label}",
+   prompt_label_mapping=dict(
+       positive="positivo", neutral="neutro", negative="negativo"
+   ),
+   instruction_prompt="Tweet: {text}\n\nClassificare il sentimento nel Tweet. "
+   "Rispondete con 'positivo', 'neutro' o 'negativo', e nient'altro.",
+   num_few_shot_examples=12,
+   max_generated_tokens=5,
+)
+

 ### NAMED ENTITY RECOGNITION DATASETS ###

@@ -718,6 +738,85 @@ WIKIANN_FO_CONFIG = DatasetConfig(
    unofficial=True,
 )

+WIKINEURAL_IT_CONFIG = DatasetConfig(
+   name="wikineural-it",
+   pretty_name="the truncated version of the Italian named "
+   "entity recognition dataset WikiNEuRal IT",
+   huggingface_id="EuroEval/wikineural-mini-it",
+   task=NER,
+   languages=[IT],
+   labels=[
+       "o",
+       "b-loc",
+       "i-loc",
+       "b-org",
+       "i-org",
+       "b-per",
+       "i-per",
+       "b-misc",
+       "i-misc",
+   ],
+   prompt_prefix="Di seguito sono riportate le frasi e i dizionari JSON con le entità "
+   "denominate presenti nella frase data.",
+   prompt_template="Frase: {text}\nEntità denominate: {label}",
+   prompt_label_mapping={
+       "b-per": "persona",
+       "i-per": "persona",
+       "b-loc": "posizione",
+       "i-loc": "posizione",
+       "b-org": "organizzazione",
+       "i-org": "organizzazione",
+       "b-misc": "varie",
+       "i-misc": "varie",
+   },
+   instruction_prompt="Frase: {text}\n\nIdentificare le entità nominate nella frase. "
+   "Il risultato dovrebbe essere un dizionario JSON con le chiavi 'persona', "
+   "'posizione', 'organizzazione' e 'varie'. I valori devono essere elenchi di entità "
+   "nominate di quel tipo, esattamente come appaiono nella frase.",
+   num_few_shot_examples=8,
+   max_generated_tokens=128,
+   unofficial=True,
+)
+
+MULTINERD_IT_CONFIG = DatasetConfig(
+   name="multinerd-it",
+   pretty_name="the truncated version of the Italian part of the named "
+   "entity recognition dataset MultiNERD",
+   huggingface_id="EuroEval/multinerd-mini-it",
+   task=NER,
+   languages=[IT],
+   labels=[
+       "o",
+       "b-loc",
+       "i-loc",
+       "b-org",
+       "i-org",
+       "b-per",
+       "i-per",
+       "b-misc",
+       "i-misc",
+   ],
+   prompt_prefix="Di seguito sono riportate le frasi e i dizionari JSON con le entità "
+   "denominate presenti nella frase data.",
+   prompt_template="Frase: {text}\nEntità denominate: {label}",
+   prompt_label_mapping={
+       "b-per": "persona",
+       "i-per": "persona",
+       "b-loc": "posizione",
+       "i-loc": "posizione",
+       "b-org": "organizzazione",
+       "i-org": "organizzazione",
+       "b-misc": "varie",
+       "i-misc": "varie",
+   },
+   instruction_prompt="Frase: {text}\n\nIdentificare le entità nominate nella frase. "
+   "Il risultato dovrebbe essere un dizionario JSON con le chiavi 'persona', "
+   "'posizione', 'organizzazione' e 'varie'. I valori devono essere elenchi di entità "
+   "nominate di quel tipo, esattamente come appaiono nella frase.",
+   num_few_shot_examples=8,
+   max_generated_tokens=128,
+)
+

 ### LINGUISTIC ACCEPTABILITY DATASETS ###

@@ -789,6 +888,25 @@ SCALA_NN_CONFIG = DatasetConfig(
    max_generated_tokens=5,
 )

+NO_COLA_CONFIG = DatasetConfig(
+   name="no-cola",
+   pretty_name="the truncated version of the Norwegian linguistic acceptability "
+   "dataset NoCoLA",
+   huggingface_id="EuroEval/no-cola-mini",
+   task=LA,
+   languages=[NB, NO],
+   labels=["incorrect", "correct"],
+   prompt_prefix="Følgende er setninger og hvorvidt de er grammatisk korrekte.",
+   prompt_template="Setning: {text}\nGrammatisk korrekt: {label}",
+   instruction_prompt="Setning: {text}\n\nBestem om setningen er grammatisk korrekt "
+   "eller ikke. Svar med 'ja' hvis setningen er korrekt og 'nei' hvis den ikke er, "
+   "og ikke noe annet.",
+   prompt_label_mapping=dict(correct="ja", incorrect="nei"),
+   num_few_shot_examples=12,
+   max_generated_tokens=5,
+   unofficial=True,
+)
+
 SCALA_IS_CONFIG = DatasetConfig(
    name="scala-is",
    pretty_name="the Icelandic part of the linguistic acceptability dataset ScaLA",
@@ -893,6 +1011,24 @@ SCALA_FR_CONFIG = DatasetConfig(
    max_generated_tokens=5,
 )

+SCALA_IT_CONFIG = DatasetConfig(
+   name="scala-it",
+   pretty_name="the Italian part of the linguistic acceptability dataset ScaLA",
+   huggingface_id="EuroEval/scala-it",
+   task=LA,
+   languages=[IT],
+   labels=["incorrect", "correct"],
+   prompt_prefix="Di seguito sono riportate le frasi e la loro correttezza "
+   "grammaticale.",
+   prompt_template="Frase : {text}\nGrammaticalmente corretto : {label}",
+   prompt_label_mapping=dict(correct="si", incorrect="no"),
+   instruction_prompt="Frase: {text}\n\nStabilite se la frase è grammaticalmente "
+   "corretta o meno. Rispondete con 'si' se la frase è corretta e con 'no' se "
+   "non lo è, e nient'altro.",
+   num_few_shot_examples=12,
+   max_generated_tokens=5,
+)
+
 DUTCH_COLA_CONFIG = DatasetConfig(
    name="dutch-cola",
    pretty_name="the truncated version of the Dutch linguistic acceptability dataset "
@@ -1139,10 +1275,26 @@ SQUAD_NL_CONFIG = DatasetConfig(
    max_generated_tokens=32,
 )

+SQUAD_IT_CONFIG = DatasetConfig(
+   name="squad-it",
+   pretty_name="the truncated version of the Italian reading comprehension dataset "
+   "SQuAD-it, translated from the English SQuAD dataset",
+   huggingface_id="EuroEval/squad-it-mini",
+   task=RC,
+   languages=[IT],
+   labels=["start_positions", "end_positions"],
+   prompt_prefix="I testi che seguono sono accompagnati da domande e risposte.",
+   prompt_template="Testo: {text}\nDomanda: {question}\nRispondere in massimo "
+   "3 parole: {label}",
+   instruction_prompt="Testo: {text}\n\nRispondi alla seguente domanda sul "
+   "in un massimo di 3 parole.\n\nDomanda: {question}",
+   num_few_shot_examples=4,
+   max_generated_tokens=32,
+)
+
 ICELANDIC_QA_CONFIG = DatasetConfig(
    name="icelandic-qa",
-   pretty_name="the Icelandic reading comprehension dataset
-   "and history",
+   pretty_name="the Icelandic reading comprehension dataset IcelandicQA",
    huggingface_id="EuroEval/icelandic-qa",
    task=RC,
    languages=[IS],
@@ -1352,6 +1504,20 @@ ORANGE_SUM_CONFIG = DatasetConfig(
    max_generated_tokens=256,
 )

+ILPOST_SUM_CONFIG = DatasetConfig(
+   name="ilpost-sum",
+   pretty_name="the truncated version of the Italian summarisation dataset IlPost",
+   huggingface_id="EuroEval/ilpost-sum",
+   task=SUMM,
+   languages=[IT],
+   prompt_prefix="Di seguito sono riportati gli articoli con i relativi riassunti.",
+   prompt_template="Articolo di cronaca: {text}\nSintesi: {target_text}",
+   instruction_prompt="Articolo di cronaca: {text}\n\nScrivete un riassunto "
+   "dell'articolo sopra citato.",
+   num_few_shot_examples=1,
+   max_generated_tokens=256,
+)
+
 # TODO: Faroese summarization


@@ -1377,7 +1543,7 @@ DANSKE_TALEMAADER_CONFIG = DatasetConfig(
 DANISH_CITIZEN_TESTS_CONFIG = DatasetConfig(
    name="danish-citizen-tests",
    pretty_name="the Danish knowledge dataset Danish Citizen Tests",
-   huggingface_id="EuroEval/danish-citizen-tests",
+   huggingface_id="EuroEval/danish-citizen-tests-updated",
    task=KNOW,
    languages=[DA],
    labels=["a", "b", "c", "d"],
@@ -1390,6 +1556,22 @@ DANISH_CITIZEN_TESTS_CONFIG = DatasetConfig(
    max_generated_tokens=5,
 )

+NRK_QUIZ_QA_CONFIG = DatasetConfig(
+   name="nrk-quiz-qa",
+   pretty_name="the truncated version of the Norwegian knowledge dataset NRK Quiz QA",
+   huggingface_id="EuroEval/nrk-quiz-qa-mini",
+   task=KNOW,
+   languages=[NB, NN, NO],
+   labels=["a", "b", "c", "d"],
+   prompt_prefix="Følgende er flervalgsspørsmål (med svar).",
+   prompt_template="Spørsmål: {text}\nSvar: {label}",
+   prompt_label_mapping=dict(a="a", b="b", c="c", d="d"),
+   instruction_prompt="Spørsmål: {text}\n\nBesvar følgende spørsmål med 'a', 'b', "
+   "'c' eller 'd', og ikke noe annet.",
+   num_few_shot_examples=5,
+   max_generated_tokens=5,
+)
+
 MMLU_NO_CONFIG = DatasetConfig(
    name="mmlu-no",
    pretty_name="the truncated version of the Norwegian knowledge dataset MMLU-no, "
@@ -1405,6 +1587,7 @@ MMLU_NO_CONFIG = DatasetConfig(
    "'c' eller 'd', og ikke noe annet.",
    num_few_shot_examples=5,
    max_generated_tokens=5,
+   unofficial=True,
 )

 MMLU_SV_CONFIG = DatasetConfig(
@@ -1444,7 +1627,8 @@ MMLU_IS_CONFIG = DatasetConfig(

 ICELANDIC_KNOWLEDGE_CONFIG = DatasetConfig(
    name="icelandic-knowledge",
-   pretty_name="the
+   pretty_name="the Icelandic knowledge dataset IcelandicKnowledge, derived from the "
+   "IcelandicQA dataset",
    huggingface_id="EuroEval/icelandic-knowledge",
    task=KNOW,
    languages=[IS],
@@ -1456,7 +1640,6 @@ ICELANDIC_KNOWLEDGE_CONFIG = DatasetConfig(
    "'b', 'c' eða 'd'.",
    num_few_shot_examples=5,
    max_generated_tokens=5,
-   unofficial=True,
 )

 MMLU_DE_CONFIG = DatasetConfig(
@@ -1545,6 +1728,23 @@ MMLU_FR_CONFIG = DatasetConfig(
    max_generated_tokens=5,
 )

+MMLU_IT_CONFIG = DatasetConfig(
+   name="mmlu-it",
+   pretty_name="the truncated version of the Italian knowledge dataset MMLU-it, "
+   "translated from the English MMLU dataset",
+   huggingface_id="EuroEval/mmlu-it-mini",
+   task=KNOW,
+   languages=[IT],
+   labels=["a", "b", "c", "d"],
+   prompt_prefix="Le seguenti sono domande a scelta multipla (con relative risposte).",
+   prompt_template="Domanda: {text}\nRéponse: {label}",
+   prompt_label_mapping=dict(a="a", b="b", c="c", d="d"),
+   instruction_prompt="Domanda: {text}\n\nRispondete alla domanda precedente con "
+   "'a', 'b', 'c' o 'd' e nient'altro.",
+   num_few_shot_examples=5,
+   max_generated_tokens=5,
+)
+
 ARC_DA_CONFIG = DatasetConfig(
    name="arc-da",
    pretty_name="the truncated version of the Danish knowledge dataset ARC-da, "
@@ -1614,6 +1814,7 @@ ARC_IS_CONFIG = DatasetConfig(
    "'b', 'c' eða 'd', og engu öðru.",
    num_few_shot_examples=5,
    max_generated_tokens=5,
+   unofficial=True,
 )

 ARC_DE_CONFIG = DatasetConfig(
@@ -1691,6 +1892,23 @@ HELLASWAG_DA_CONFIG = DatasetConfig(
    max_generated_tokens=5,
 )

+NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
+   name="nor-common-sense-qa",
+   pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
+   "NorCommonSenseQA",
+   huggingface_id="EuroEval/nor-common-sense-qa",
+   task=COMMON_SENSE,
+   languages=[NB, NN, NO],
+   labels=["a", "b", "c", "d", "e"],
+   prompt_prefix="Følgende er flervalgsspørsmål (med svar).",
+   prompt_template="Spørsmål: {text}\nSvar: {label}",
+   prompt_label_mapping=dict(a="a", b="b", c="c", d="d", e="e"),
+   instruction_prompt="Spørsmål: {text}\n\nBesvar følgende spørsmål med 'a', 'b', "
+   "'c' eller 'd', og ikke noe annet.",
+   num_few_shot_examples=5,
+   max_generated_tokens=5,
+)
+
 HELLASWAG_NO_CONFIG = DatasetConfig(
    name="hellaswag-no",
    pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
@@ -1706,6 +1924,7 @@ HELLASWAG_NO_CONFIG = DatasetConfig(
    "'c' eller 'd', og ikke noe annet.",
    num_few_shot_examples=5,
    max_generated_tokens=5,
+   unofficial=True,
 )

 HELLASWAG_SV_CONFIG = DatasetConfig(
@@ -1829,6 +2048,23 @@ HELLASWAG_FR_CONFIG = DatasetConfig(
    max_generated_tokens=5,
 )

+HELLASWAG_IT_CONFIG = DatasetConfig(
+   name="hellaswag-it",
+   pretty_name="the truncated version of the Italian common-sense reasoning dataset "
+   "HellaSwag-it, translated from the English HellaSwag dataset",
+   huggingface_id="EuroEval/hellaswag-it-mini",
+   task=COMMON_SENSE,
+   languages=[IT],
+   labels=["a", "b", "c", "d"],
+   prompt_prefix="Le seguenti sono domande a scelta multipla (con relative risposte).",
+   prompt_template="Domanda: {text}\nRéponse: {label}",
+   prompt_label_mapping=dict(a="a", b="b", c="c", d="d"),
+   instruction_prompt="Domanda: {text}\n\nRispondete alla domanda precedente con "
+   "'a', 'b', 'c' o 'd' e nient'altro.",
+   num_few_shot_examples=5,
+   max_generated_tokens=5,
+)
+
 # TODO: Faroese common sense reasoning

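Each new config follows the same scheme: `prompt_prefix` introduces the task, `prompt_template` renders one example, and `prompt_label_mapping` localises the label names. As a rough illustration of how such fields can compose into a few-shot prompt (a sketch only; `build_few_shot_prompt` is hypothetical and not EuroEval's actual prompt builder):

```python
def build_few_shot_prompt(
    prefix: str,
    template: str,
    label_mapping: dict[str, str],
    examples: list[tuple[str, str]],
    new_text: str,
) -> str:
    # Render each few-shot example with its localised label, then append
    # the new text with the label left blank for the model to complete.
    shots = [
        template.format(text=text, label=label_mapping[label])
        for text, label in examples
    ]
    query = template.format(text=new_text, label="").rstrip()
    return "\n\n".join([prefix, *shots, query])


prompt = build_few_shot_prompt(
    prefix="Di seguito sono riportati i testi e il loro sentimento.",
    template="Tweet: {text}\nSentimento: {label}",
    label_mapping=dict(positive="positivo", neutral="neutro", negative="negativo"),
    examples=[("Che bella giornata!", "positive")],
    new_text="Il servizio è stato pessimo.",
)
print(prompt)
```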
euroeval/task_utils/question_answering.py CHANGED
@@ -8,7 +8,7 @@ from collections import defaultdict
 import evaluate
 import numpy as np
 from evaluate import EvaluationModule
-from transformers import PreTrainedTokenizer
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
 from transformers.trainer import Trainer

 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
@@ -21,12 +21,8 @@ if t.TYPE_CHECKING:
    import torch.nn as nn
    from datasets.arrow_dataset import Dataset
    from transformers import (
-       BaseImageProcessor,
        EvalPrediction,
-       FeatureExtractionMixin,
        PreTrainedModel,
-       PreTrainedTokenizerBase,
-       ProcessorMixin,
        TrainerCallback,
        TrainingArguments,
    )
@@ -65,7 +61,7 @@ class QuestionAnsweringTrainer(Trainer):

        # Get the CLS token id for the tokenizer
        if self.tokenizer is not None:
-           assert isinstance(self.tokenizer,
+           assert isinstance(self.tokenizer, PreTrainedTokenizerBase)
            special_token_metadata = get_special_token_metadata(self.tokenizer)
            self.cls_token_id = special_token_metadata["cls_token_id"]

@@ -147,7 +143,7 @@ class QuestionAnsweringTrainer(Trainer):


 def compute_metrics(
-   model_outputs_and_labels: tuple[
+   model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
@@ -167,6 +163,13 @@ compute_metrics(
        values.
    """
    model_outputs, labels = model_outputs_and_labels
+
+   # If the model outputs is a pair, then the first element corresponds to the model
+   # predictions
+   if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
+       model_outputs = model_outputs[0]
+
+   assert not isinstance(model_outputs, tuple)
    raise_if_model_output_contains_nan_values(model_output=model_outputs)

    metrics = {
euroeval/task_utils/sequence_classification.py CHANGED
@@ -13,6 +13,8 @@ from ..data_models import BenchmarkConfig, GenerativeModelOutput
 from ..utils import log_once, raise_if_model_output_contains_nan_values

 if t.TYPE_CHECKING:
+   from transformers import EvalPrediction
+
    from ..data_models import DatasetConfig
    from ..types import Labels, Predictions

@@ -21,7 +23,7 @@ logger = logging.getLogger("euroeval")


 def compute_metrics(
-   model_outputs_and_labels: tuple[
+   model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
@@ -42,7 +44,11 @@ compute_metrics(
    """
    model_outputs, labels = model_outputs_and_labels
    label2id = {label: idx for idx, label in dataset_config.id2label.items()}
-
+
+   # If the model outputs is a pair, then the first element corresponds to the model
+   # predictions
+   if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
+       model_outputs = model_outputs[0]

    metrics = {
        metric_cfg.name: (
@@ -61,6 +67,9 @@ compute_metrics(
    else:
        predictions = model_outputs

+   assert not isinstance(model_outputs, tuple)
+   raise_if_model_output_contains_nan_values(model_output=model_outputs)
+
    prompt_label_to_label_mapping = {
        prompt_label: label
        for label, prompt_label in dataset_config.prompt_label_mapping.items()
euroeval/task_utils/text_to_text.py CHANGED
@@ -17,6 +17,8 @@ from ..utils import (
 )

 if t.TYPE_CHECKING:
+   from transformers import EvalPrediction
+
    from ..types import Labels, Predictions


@@ -24,7 +26,7 @@ logger = logging.getLogger("euroeval")


 def compute_metrics(
-   model_outputs_and_labels: tuple[
+   model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
@@ -44,6 +46,13 @@ compute_metrics(
        values.
    """
    model_outputs, labels = model_outputs_and_labels
+
+   # If the model outputs is a pair, then the first element corresponds to the model
+   # predictions
+   if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
+       model_outputs = model_outputs[0]
+
+   assert not isinstance(model_outputs, tuple)
    raise_if_model_output_contains_nan_values(model_output=model_outputs)

    metrics = {
euroeval/task_utils/token_classification.py CHANGED
@@ -16,7 +16,7 @@ from ..exceptions import InvalidBenchmark, NeedsExtraInstalled
 from ..utils import raise_if_model_output_contains_nan_values

 if t.TYPE_CHECKING:
-   from transformers import BatchEncoding
+   from transformers import BatchEncoding, EvalPrediction

    from ..types import Labels, Predictions

@@ -28,7 +28,7 @@ logger = logging.getLogger("euroeval")


 def compute_metrics(
-   model_outputs_and_labels: tuple[
+   model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
    has_misc_tags: bool,
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
@@ -51,7 +51,11 @@ compute_metrics(
        values.
    """
    model_outputs, labels = model_outputs_and_labels
-
+
+   # If the model outputs is a pair, then the first element corresponds to the model
+   # predictions
+   if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
+       model_outputs = model_outputs[0]

    metrics = {
        metric_cfg.name: (
@@ -93,6 +97,8 @@ compute_metrics(
    else:
        predictions = model_outputs  # type: ignore[assignment]

+   raise_if_model_output_contains_nan_values(model_output=predictions)
+
    # Replace predicted tag with either MISC or O tags if they are not part of the
    # dataset
    labels_without_misc = {
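The same normalisation now appears in all four task modules: `transformers` may pass `compute_metrics` an `EvalPrediction` whose predictions field is itself a tuple of logits plus auxiliary outputs, so the first element is taken as the actual predictions. A standalone sketch of the unpacking logic:

```python
import numpy as np


def normalise_model_outputs(model_outputs_and_labels):
    # Works for both a plain (predictions, labels) pair and an EvalPrediction
    # without an `inputs` field, since that also unpacks into two elements.
    model_outputs, labels = model_outputs_and_labels

    # If the model outputs are a pair, the first element holds the actual
    # predictions and the rest is auxiliary output.
    if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
        model_outputs = model_outputs[0]

    return model_outputs, labels


preds, labels = normalise_model_outputs(
    ((np.zeros((2, 3)), None), np.array([0, 1]))
)
assert preds.shape == (2, 3)
```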
euroeval/utils.py CHANGED
@@ -21,7 +21,7 @@ import requests
 import torch
 from datasets.utils import disable_progress_bar
 from requests.exceptions import RequestException
-from transformers import PreTrainedTokenizer
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
 from transformers import logging as tf_logging

 from .exceptions import InvalidModel, NaNValueInModelOutput
@@ -231,7 +231,7 @@ def internet_connection_available() -> bool:
    return False


-def get_special_token_metadata(tokenizer: "
+def get_special_token_metadata(tokenizer: "PreTrainedTokenizerBase") -> dict:
    """Get the special token metadata for a tokenizer.

    Args:
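Annotating with `PreTrainedTokenizerBase` instead of `PreTrainedTokenizer` lets the helper accept fast (Rust-backed) tokenizers as well, since both slow and fast tokenizers inherit from that base class. A minimal sketch of this kind of metadata lookup (a simplified stand-in, not the real `get_special_token_metadata`):

```python
from transformers import AutoTokenizer, PreTrainedTokenizerBase


def special_token_ids(tokenizer: PreTrainedTokenizerBase) -> dict:
    # Both slow and fast tokenizers expose these attributes through
    # PreTrainedTokenizerBase, so the wider annotation is accurate.
    return {
        "cls_token_id": tokenizer.cls_token_id,
        "sep_token_id": tokenizer.sep_token_id,
    }


tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
print(special_token_ids(tokenizer))
```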
{euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.
+Version: 15.3.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -49,6 +49,7 @@ Requires-Dist: sacremoses>=0.1.1
 Requires-Dist: scikit-learn<1.6.0
 Requires-Dist: sentencepiece>=0.1.96
 Requires-Dist: seqeval>=1.2.2
+Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
 Requires-Dist: torch>=2.3.0
@@ -76,6 +77,8 @@ Description-Content-Type: text/markdown

 ### The robust European language model benchmark.

+_(formerly known as ScandEval)_
+
 ______________________________________________________________________
 [](https://euroeval.com)
 [](https://pypi.org/project/euroeval/)
{euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/RECORD CHANGED
@@ -1,12 +1,12 @@
 euroeval/__init__.py,sha256=3od9_ucHlILSbe4WCR8k5PbeorvmUr-VjOKXJ01I0fA,2165
 euroeval/benchmark_config_factory.py,sha256=pi4Lu--ySKZRd9ItG6VKS6BPLis64vL-7UE99VSXq5Y,12534
-euroeval/benchmarker.py,sha256=
+euroeval/benchmarker.py,sha256=__DdnOvI9CNpgqPT1hsTl0GZFTyQ6KRfiQowCuh36sc,46534
 euroeval/callbacks.py,sha256=bThUUxOgkMuESUQ5rrFRoSumKV8vNw53CslIZTpkt54,2438
 euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
 euroeval/constants.py,sha256=qFrm3cRT6UlnTXfHUmxqZsr0SBsGskjV1qrUlnAW-aw,1473
-euroeval/data_loading.py,sha256=
+euroeval/data_loading.py,sha256=RoatBJMpGurP_y5O3KrEvly8Z_yYEapQnnMZ_tWWrlc,3272
 euroeval/data_models.py,sha256=4ZY9x2pINlRywTzYxxtrYG7qXMNdod5I9XBOlTJYT8E,14495
-euroeval/dataset_configs.py,sha256=
+euroeval/dataset_configs.py,sha256=Cj3McxA0JTC7RKzXofzpJfmIhoXAfF756f_1SZUaPlw,84391
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
 euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
@@ -20,21 +20,21 @@ euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
 euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
 euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
 euroeval/types.py,sha256=xvBn0eNynqAqwL7CGEgVFb_lCD9SdHUMvxJo7OXRfls,2367
-euroeval/utils.py,sha256=
+euroeval/utils.py,sha256=K4z2IQilLJo6Cf8bzM46PYTaylDv6bYi7FRbHTbZulE,18736
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
-euroeval/benchmark_modules/fresh.py,sha256=
+euroeval/benchmark_modules/fresh.py,sha256=k6bqDEnazRAX9ILVsRrzUTbkgNO4NcLCxHToCnLWV8M,9641
 euroeval/benchmark_modules/hf.py,sha256=n3VIUA7XOOTgbSMkmYp5S06iJV0kp7aMq8YzRb0EDLw,41741
 euroeval/benchmark_modules/litellm.py,sha256=uMPzUjTU54UHDmBImzWUFCGUupKvZNQN-2u0c8UaM3s,34488
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/vllm.py,sha256=cw7onFYXQ66cr2c4WTB90VYtQYc47lkwz6A25FW8sBs,43444
 euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
-euroeval/task_utils/question_answering.py,sha256=
-euroeval/task_utils/sequence_classification.py,sha256=
-euroeval/task_utils/text_to_text.py,sha256
-euroeval/task_utils/token_classification.py,sha256=
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
+euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
+euroeval/task_utils/sequence_classification.py,sha256=FrkvFzxFSnZoXThgpQqvJCIy3_YemyqZFQ1L-YdMMiw,8527
+euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
+euroeval/task_utils/token_classification.py,sha256=yT1YvZzmqNaVSRZ67BvyURhlkgTm3ltWPft4HxodZAE,17983
+euroeval-15.3.1.dist-info/METADATA,sha256=elF7s_zt2tj9Hl1EMMDfNoMtskYK5Xh9i-N36vvzfQs,10263
+euroeval-15.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.3.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.3.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.3.1.dist-info/RECORD,,
{euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/WHEEL
File without changes
{euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/entry_points.txt
File without changes
{euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/licenses/LICENSE
File without changes