EuroEval 15.3.1-py3-none-any.whl → 15.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


euroeval/__init__.py CHANGED
@@ -14,6 +14,7 @@ warnings.filterwarnings("ignore", category=UserWarning)
  logging.getLogger("httpx").setLevel(logging.CRITICAL)
  logging.getLogger("datasets").setLevel(logging.CRITICAL)
  logging.getLogger("vllm").setLevel(logging.CRITICAL)
+ logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
 
  # Set up logging
  fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
@@ -66,6 +67,16 @@ os.environ["OMP_NUM_THREADS"] = "1"
  os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"
 
 
+ # Avoid the "Cannot re-initialize CUDA in forked subprocess" error - see
+ # https://github.com/vllm-project/vllm/issues/6152 for more
+ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+
+ # Use older version v0 of vLLM, as the newer one requires XGrammar as decoding backend,
+ # but XGrammar does not support having a maximal amount of elements in lists
+ os.environ["VLLM_USE_V1"] = "0"
+
+
  # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
  # former and LiteLLM uses the latter
  if os.getenv("HUGGINGFACE_API_KEY"):
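The two new environment variables only take effect if they are in place before vLLM initialises, so a downstream script that manages its own environment would need to mirror the same ordering. A minimal sketch of that setup (illustrative, not part of the package):

```python
import os

# Mirror of the settings added to euroeval/__init__.py; set them before vLLM
# is imported or initialised for the first time.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"  # avoid CUDA re-init in forked workers
os.environ["VLLM_USE_V1"] = "0"  # stay on the v0 engine rather than the XGrammar-based v1 engine
```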
euroeval/benchmark_config_factory.py CHANGED
@@ -12,7 +12,7 @@ from .dataset_configs import get_all_dataset_configs
  from .enums import Device
  from .exceptions import InvalidBenchmark
  from .languages import get_all_languages
- from .tasks import get_all_tasks
+ from .tasks import SPEED, get_all_tasks
  from .utils import log_once
 
  if t.TYPE_CHECKING:
@@ -294,7 +294,7 @@ def prepare_tasks_and_datasets(
      # Create the list of dataset tasks
      try:
          if task is None:
-             tasks = list(task_mapping.values())
+             tasks = [t for t in task_mapping.values() if t != SPEED]
          elif isinstance(task, str):
              tasks = [task_mapping[task]]
          else:
euroeval/benchmark_modules/hf.py CHANGED
@@ -224,8 +224,6 @@ class HuggingFaceEncoderModel(BenchmarkModule):
              "max_position_embeddings",
              "max_sequence_length",
              "model_max_length",
-             "sliding_window",
-             "sliding_window_size",
              "n_positions",
          ]
          for candidate_config_max_length in candidate_config_max_lengths:
@@ -804,7 +802,7 @@ def get_model_repo_info(
      generative_class_names = [
          class_name
          for tag in GENERATIVE_PIPELINE_TAGS
-         for class_name in TASK_MAPPING[tag].values()
+         for class_name in TASK_MAPPING.get(tag, dict()).values()
      ]
      if class_names is not None and any(
          class_name in generative_class_names for class_name in class_names
@@ -1023,6 +1021,7 @@ def setup_model_for_question_answering(model: "PreTrainedModel") -> "PreTrainedM
      """
      # Get the models' token type embedding children, if they exist
      children = get_children_of_module(name="model", module=model)
+     assert isinstance(children, dict)
 
      # If the model has token type embeddings then get them
      if children:
euroeval/benchmark_modules/litellm.py CHANGED
@@ -12,6 +12,7 @@ from functools import cached_property, partial
  from time import sleep
 
  import litellm
+ import ollama
  from datasets import DatasetDict
  from huggingface_hub import HfApi
  from huggingface_hub.errors import (
@@ -31,6 +32,7 @@ from litellm.exceptions import (
  )
  from litellm.types.utils import ModelResponse
  from requests.exceptions import RequestException
+ from tqdm.auto import tqdm
  from transformers import Trainer
 
  from ..constants import (
@@ -39,7 +41,13 @@ from ..constants import (
      TASK_GROUPS_USING_LOGPROBS,
      TASKS_USING_JSON,
  )
- from ..data_models import BenchmarkConfig, GenerativeModelOutput, ModelConfig, Task
+ from ..data_models import (
+     BenchmarkConfig,
+     DatasetConfig,
+     GenerativeModelOutput,
+     ModelConfig,
+     Task,
+ )
  from ..enums import (
      BatchingPreference,
      GenerativeType,
@@ -49,6 +57,7 @@ from ..enums import (
  )
  from ..exceptions import (
      InvalidBenchmark,
+     InvalidModel,
      NeedsAdditionalArgument,
      NeedsEnvironmentVariable,
      NeedsExtraInstalled,
@@ -60,7 +69,7 @@ from ..task_utils import (
      token_classification,
  )
  from ..types import ExtractLabelsFunction
- from ..utils import create_model_cache_dir
+ from ..utils import create_model_cache_dir, log_once
  from .base import BenchmarkModule
  from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer
 
@@ -136,6 +145,34 @@ class LiteLLMModel(BenchmarkModule):
      batching_preference = BatchingPreference.SINGLE_SAMPLE
      high_priority = False
 
+     def __init__(
+         self,
+         model_config: ModelConfig,
+         dataset_config: DatasetConfig,
+         benchmark_config: BenchmarkConfig,
+     ) -> None:
+         """Initialise the model.
+
+         Args:
+             model_config:
+                 The model configuration.
+             dataset_config:
+                 The dataset configuration.
+             benchmark_config:
+                 The benchmark configuration.
+         """
+         # Detect whether the model is an Ollama model, as we need to extract metadata
+         # differently for these models
+         self.is_ollama = model_config.model_id.startswith(
+             "ollama/"
+         ) or model_config.model_id.startswith("ollama_chat/")
+
+         super().__init__(
+             model_config=model_config,
+             dataset_config=dataset_config,
+             benchmark_config=benchmark_config,
+         )
+
      @property
      def generative_type(self) -> GenerativeType | None:
          """Get the generative type of the model.
@@ -269,10 +306,24 @@ class LiteLLMModel(BenchmarkModule):
          Returns:
              The number of parameters in the model.
          """
+         # Start by trying out the regex mapping, and use the value if it matches
          for key, value in NUM_PARAMS_MAPPING.items():
              if re.fullmatch(pattern=key, string=self.model_config.model_id) is not None:
                  return value
 
+         # If it is an Ollama model then we can get the number of parameters from the
+         # Ollama Python SDK
+         if self.is_ollama:
+             ollama_model_id = self.model_config.model_id.split("/")[-1]
+             model_info = ollama.show(ollama_model_id).modelinfo
+             if model_info is not None:
+                 num_params = model_info.get("general.parameter_count")
+                 if num_params is not None:
+                     return int(num_params)
+
+         # If it is a model accessed through the Hugging Face inference API then we can
+         # get the number of parameters from the Hugging Face model configuration from
+         # the Hugging Face Hub
          if self.model_config.model_id.startswith("huggingface/"):
              model_id = self.model_config.model_id.split(sep="/", maxsplit=1)[-1]
              if HuggingFaceEncoderModel.model_exists(
@@ -329,10 +380,14 @@ class LiteLLMModel(BenchmarkModule):
          Returns:
              The vocabulary size of the model.
          """
+         # Start by trying out the regex mapping, and use the value if it matches
          for key, value in VOCAB_SIZE_MAPPING.items():
              if re.fullmatch(pattern=key, string=self.model_config.model_id) is not None:
                  return value
 
+         # If it is a model accessed through the Hugging Face inference API then we can
+         # get the vocabulary size from the Hugging Face model configuration from the
+         # Hugging Face Hub
          if self.model_config.model_id.startswith("huggingface/"):
              model_id = self.model_config.model_id.split(sep="/", maxsplit=1)[-1]
              if HuggingFaceEncoderModel.model_exists(
@@ -379,10 +434,40 @@ class LiteLLMModel(BenchmarkModule):
          Returns:
              The maximum length of the model.
          """
+         # Start by trying out the regex mapping, and use the value if it matches
          for key, value in MODEL_MAX_LENGTH_MAPPING.items():
              if re.fullmatch(pattern=key, string=self.model_config.model_id) is not None:
                  return value
 
+         # If it is an Ollama model then we can get the maximum length from the Ollama
+         # Python SDK
+         if self.is_ollama:
+             ollama_model_id = self.model_config.model_id.split("/")[-1]
+             model_info = ollama.show(ollama_model_id).modelinfo
+             if model_info is not None:
+                 context_length_keys = [
+                     key for key in model_info.keys() if "context_length" in key.lower()
+                 ]
+                 if context_length_keys:
+                     context_length = model_info[context_length_keys[0]]
+                     if context_length is not None:
+                         log_once(
+                             f"Detected context length key {context_length_keys[0]!r} "
+                             f"for Ollama model {ollama_model_id!r}",
+                             level=logging.DEBUG,
+                         )
+                         return int(context_length)
+                 else:
+                     log_once(
+                         f"Tried to get the maximum length of the Ollama model "
+                         f"{ollama_model_id!r}, but could not find a context length. "
+                         f"The model info was {model_info}. Returning -1",
+                         level=logging.DEBUG,
+                     )
+
+         # If it is a model accessed through the Hugging Face inference API then we can
+         # get the maximum length from the Hugging Face model configuration from the
+         # Hugging Face Hub
          if self.model_config.model_id.startswith("huggingface/"):
              model_id = self.model_config.model_id.split(sep="/", maxsplit=1)[-1]
              if HuggingFaceEncoderModel.model_exists(
@@ -523,6 +608,43 @@ class LiteLLMModel(BenchmarkModule):
          if model_id in litellm.model_list:
              return True
 
+         # If it is an Ollama model then try to download it
+         if model_id.startswith("ollama/") or model_id.startswith("ollama_chat/"):
+             ollama_model_id = model_id.split("/")[-1]
+             downloaded_ollama_models: list[str] = [
+                 model_obj.model
+                 for model_obj in ollama.list().models
+                 if model_obj.model is not None
+             ]
+             if ollama_model_id not in downloaded_ollama_models:
+                 try:
+                     response = ollama.pull(model=ollama_model_id, stream=True)
+                     with tqdm(
+                         desc=f"Downloading {ollama_model_id}",
+                         unit_scale=True,
+                         unit="B",
+                         leave=False,
+                     ) as pbar:
+                         for status in response:
+                             if status.total is not None:
+                                 pbar.total = status.total
+                             if status.completed is not None:
+                                 pbar.update(status.completed - pbar.n)
+                 except ollama.ResponseError as e:
+                     if "file does not exist" in str(e).lower():
+                         return False
+                     else:
+                         raise InvalidModel(
+                             f"Failed to download Ollama model {ollama_model_id}. The "
+                             f"error message was: {e}"
+                         )
+             else:
+                 log_once(
+                     f"Ollama model {ollama_model_id!r} already downloaded, so skipping "
+                     "download.",
+                     level=logging.DEBUG,
+                 )
+
          num_attempts = 10
          for _ in range(num_attempts):
              try:
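For readers less familiar with the Ollama SDK calls used above, a minimal sketch of the same metadata lookups, assuming a locally running Ollama server (the model name is only an example):

```python
import ollama

# Hypothetical model name; any locally pulled Ollama model works here.
ollama_model_id = "llama3.2"

# The same lookups the new LiteLLMModel code performs: parameter count and
# context length are plain key/value pairs in the `modelinfo` mapping.
model_info = ollama.show(ollama_model_id).modelinfo or {}
num_params = model_info.get("general.parameter_count")
context_keys = [key for key in model_info if "context_length" in key.lower()]
max_length = int(model_info[context_keys[0]]) if context_keys else -1

print(f"{num_params=} {max_length=}")
```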
euroeval/benchmark_modules/vllm.py CHANGED
@@ -73,7 +73,6 @@ from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_conf
  if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
      from vllm import LLM, RequestOutput, SamplingParams
      from vllm.lora.request import LoRARequest
-     from vllm.sampling_params import GuidedDecodingParams
 
      try:
          from vllm.model_executor.parallel_utils.parallel_state import (
@@ -82,6 +81,10 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
      except ImportError:
          from vllm.distributed.parallel_state import destroy_model_parallel
 
+ if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
+     from outlines.models.vllm import adapt_tokenizer
+     from outlines.processors import JSONLogitsProcessor
+
  if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
      import ray
 
@@ -319,12 +322,18 @@ class VLLMModel(HuggingFaceEncoderModel):
                  for tag_name in ner_tag_names
              }
              pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
-             schema = pydantic_class.model_json_schema()
-             guided_decoding = GuidedDecodingParams(
-                 json=schema, backend="outlines", whitespace_pattern=r" ?"
+             logits_processor = JSONLogitsProcessor(
+                 schema=pydantic_class,
+                 tokenizer=adapt_tokenizer(tokenizer=self._tokenizer),  # type: ignore
+                 whitespace_pattern=r" ?",
+             )
+             log_once(
+                 "Using structured generation with the schema "
+                 f"{pydantic_class.model_json_schema()}",
+                 level=logging.DEBUG,
              )
          else:
-             guided_decoding = None
+             logits_processor = None
 
          # Define the parameters used for vLLM generation
          max_tokens: int = (
@@ -337,7 +346,7 @@ class VLLMModel(HuggingFaceEncoderModel):
              logprobs=MAX_LOGPROBS if self.buffer["output_scores"] else None,
              temperature=0.0,
              stop=[stop_token for stop_token in stop_tokens if stop_token],
-             guided_decoding=guided_decoding,
+             logits_processors=[logits_processor] if logits_processor else None,
          )
 
          # If any of the prompts are empty then we need to replace them with a BOS token
@@ -881,8 +890,6 @@ def load_model_and_tokenizer(
          "max_position_embeddings",
          "max_sequence_length",
          "model_max_length",
-         "sliding_window",
-         "sliding_window_size",
          "n_positions",
      ]
      true_max_model_len_candidates: list[int] = list()
@@ -1087,7 +1094,8 @@ def get_end_of_reasoning_token_id(
  """Get the end of reasoning token ID for a generative model.
 
  This assumes that the reasoning token is of the form <X> and that the end of
- reasoning token is </X> (for X being any string without spaces).
+ reasoning token is </X> (for X being any string without spaces). We disallow the
+ reasoning token to be the same as the beginning-of-sentence token.
 
  Args:
      model:
@@ -1106,6 +1114,7 @@ def get_end_of_reasoning_token_id(
      add_generation_prompt=True,
      tokenize=False,
  )
+ assert isinstance(prompt, str)
 
  # Generate a completion and remove the BOS token from it, to not confuse it with the
  # potential reasoning token
@@ -1119,11 +1128,18 @@ def get_end_of_reasoning_token_id(
      .text
  )
  if tokenizer.bos_token is not None:
-     completion = completion.replace(tokenizer.bos_token, "").strip()
+     if isinstance(tokenizer.bos_token, str):
+         prompt = prompt.replace(tokenizer.bos_token, "").strip()
+         completion = completion.replace(tokenizer.bos_token, "").strip()
+     elif isinstance(tokenizer.bos_token, list):
+         for bos_token in tokenizer.bos_token:
+             prompt = prompt.replace(bos_token, "").strip()
+             completion = completion.replace(bos_token, "").strip()
 
  # If it doesn't contain a reasoning token, we can't find the end of reasoning token
- match = re.search(pattern=r"<\w+>", string=completion)
- if match is None:
+ prompt_match = re.search(pattern=r"<\w+>", string=prompt)
+ completion_match = re.search(pattern=r"<\w+>", string=completion)
+ if completion_match is None and prompt_match is None:
      log_once(
          message=(
              "Could not find a reasoning token, so assuming the model is not a "
@@ -1135,7 +1151,11 @@ def get_end_of_reasoning_token_id(
 
  # Check that the found reasoning token and its associated end-of-reasoning tokens
  # are both special tokens
- reasoning_token = match.group()
+ elif completion_match is not None:
+     reasoning_token = completion_match.group()
+ else:
+     assert prompt_match is not None
+     reasoning_token = prompt_match.group()
  end_of_reasoning_token = f"</{reasoning_token[1:-1]}>"
  special_tokens = [
      decoder_token.content
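A small illustration (not from the package) of the reasoning-token convention that the docstring of `get_end_of_reasoning_token_id` describes: the first `<X>`-shaped token found in the prompt or completion determines the expected end-of-reasoning token `</X>`:

```python
import re

# Example completion from a hypothetical reasoning model.
completion = "<think> First I weigh the options... </think> The answer is (b)."

match = re.search(r"<\w+>", completion)
if match is not None:
    reasoning_token = match.group()                         # "<think>"
    end_of_reasoning_token = f"</{reasoning_token[1:-1]}>"  # "</think>"
    print(reasoning_token, end_of_reasoning_token)
```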
euroeval/benchmarker.py CHANGED
@@ -709,7 +709,7 @@ class Benchmarker:
 
          if dataset_config.task == SPEED:
              scores = benchmark_speed(
-                 model=model, benchmark_config=self.benchmark_config
+                 model=model, benchmark_config=benchmark_config
              )
 
          else:
@@ -727,7 +727,7 @@ class Benchmarker:
                  datasets=prepared_datasets,
                  model_config=model_config,
                  dataset_config=dataset_config,
-                 benchmark_config=self.benchmark_config,
+                 benchmark_config=benchmark_config,
              )
          else:
              scores = finetune(
euroeval/constants.py CHANGED
@@ -13,7 +13,13 @@ REASONING_MAX_TOKENS = 8_192
 
 
  # The Hugging Face Hub pipeline tags used to classify models as generative
- GENERATIVE_PIPELINE_TAGS = ["text-generation", "text2text-generation"]
+ GENERATIVE_PIPELINE_TAGS = [
+     "text-generation",
+     "text2text-generation",
+     "image-text-to-text",
+     "audio-text-to-text",
+     "video-text-to-text",
+ ]
 
 
  # Used to disallow non-generative models to be evaluated on these task groups
euroeval/data_loading.py CHANGED
@@ -8,6 +8,7 @@ from datasets import Dataset, DatasetDict, load_dataset
  from datasets.exceptions import DatasetsError
  from huggingface_hub.errors import HfHubHTTPError
  from numpy.random import Generator
+ from requests import ReadTimeout
 
  from .data_models import BenchmarkConfig, DatasetConfig
  from .exceptions import HuggingFaceHubDown, InvalidBenchmark
@@ -47,7 +48,7 @@ def load_data(
                  token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
              )
              break
-         except (FileNotFoundError, DatasetsError, ConnectionError):
+         except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
              logger.warning(
                  f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
              )
euroeval/dataset_configs.py CHANGED
@@ -1,7 +1,22 @@
  """All dataset configurations used in EuroEval."""
 
  from .data_models import DatasetConfig
- from .languages import DA, DE, EN, FO, FR, IS, IT, NB, NL, NN, NO, SV, get_all_languages
+ from .languages import (
+     DA,
+     DE,
+     EN,
+     ES,
+     FO,
+     FR,
+     IS,
+     IT,
+     NB,
+     NL,
+     NN,
+     NO,
+     SV,
+     get_all_languages,
+ )
  from .tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SPEED, SUMM
 
 
@@ -265,6 +280,25 @@ SENTIPOLC_CONFIG = DatasetConfig(
  )
 
 
+ SENTIMENT_HEADLINES_CONFIG = DatasetConfig(
+     name="sentiment-headlines-es",
+     pretty_name="the truncated version of the Spanish sentiment headlines dataset",
+     huggingface_id="EuroEval/sentiment-headlines-es",
+     task=SENT,
+     languages=[ES],
+     labels=["negative", "neutral", "positive"],
+     prompt_prefix="Lo siguiente son reseñas y su sentimiento, que puede ser "
+     "'positivo', 'neutral' o 'negativo'.",
+     prompt_template="Texto: {text}\nSentimiento: {label}",
+     prompt_label_mapping=dict(
+         positive="positivo", neutral="neutral", negative="negativo"
+     ),
+     instruction_prompt="Texto: {text}\n\nClasifica el sentimiento de la reseña. "
+     "Responde con 'positivo', 'neutral' o 'negativo', y nada más.",
+     num_few_shot_examples=12,
+     max_generated_tokens=5,
+ )
+
  ### NAMED ENTITY RECOGNITION DATASETS ###
 
  SUC3_CONFIG = DatasetConfig(
@@ -817,6 +851,45 @@ MULTINERD_IT_CONFIG = DatasetConfig(
      max_generated_tokens=128,
  )
 
+ CONLL_ES_CONFIG = DatasetConfig(
+     name="conll-es",
+     pretty_name="the Spanish part of the truncated version of the named entity "
+     "recognition dataset CoNLL 2002",
+     huggingface_id="EuroEval/conll-es-mini",
+     task=NER,
+     languages=[ES],
+     labels=[
+         "o",
+         "b-loc",
+         "i-loc",
+         "b-org",
+         "i-org",
+         "b-per",
+         "i-per",
+         "b-misc",
+         "i-misc",
+     ],
+     prompt_prefix="Lo siguiente son oraciones y diccionarios JSON con las entidades "
+     "nombradas que aparecen en la oración dada.",
+     prompt_template="Oración: {text}\nEntidades nombradas: {label}",
+     prompt_label_mapping={
+         "b-per": "persona",
+         "i-per": "persona",
+         "b-loc": "lugar",
+         "i-loc": "lugar",
+         "b-org": "organización",
+         "i-org": "organización",
+         "b-misc": "misceláneo",
+         "i-misc": "misceláneo",
+     },
+     instruction_prompt="Oración: {text}\n\nIdentifica las entidades nombradas en la "
+     "oración. Debes producir esto como un diccionario JSON con las claves 'persona', "
+     "'lugar', 'organización' y 'misceláneo'. Los valores deben ser listas de las "
+     "entidades nombradas de ese tipo, exactamente como aparecen en la oración.",
+     num_few_shot_examples=8,
+     max_generated_tokens=128,
+     unofficial=True,
+ )
 
  ### LINGUISTIC ACCEPTABILITY DATASETS ###
 
@@ -1029,6 +1102,22 @@ SCALA_IT_CONFIG = DatasetConfig(
      max_generated_tokens=5,
  )
 
+ SCALA_ES_CONFIG = DatasetConfig(
+     name="scala-es",
+     pretty_name="the Spanish part of the linguistic acceptability dataset ScaLA",
+     huggingface_id="EuroEval/scala-es",
+     task=LA,
+     languages=[ES],
+     labels=["incorrect", "correct"],
+     prompt_prefix="Lo siguiente son textos y si son gramaticalmente correctos.",
+     prompt_template="Texto: {text}\nGramaticalmente correcto: {label}",
+     prompt_label_mapping=dict(correct="sí", incorrect="no"),
+     instruction_prompt="Texto: {text}\n\nDetermina si el texto es gramaticalmente "
+     "correcto o no. Responde con 'sí' si el texto es correcto, y 'no' si no lo es.",
+     num_few_shot_examples=12,
+     max_generated_tokens=5,
+ )
+
  DUTCH_COLA_CONFIG = DatasetConfig(
      name="dutch-cola",
      pretty_name="the truncated version of the Dutch linguistic acceptability dataset "
@@ -1326,6 +1415,41 @@ FQUAD_CONFIG = DatasetConfig(
      max_generated_tokens=32,
  )
 
+ XQUAD_ES_CONFIG = DatasetConfig(
+     name="xquad-es",
+     pretty_name="the Spanish version of the XQuAD reading comprehension dataset",
+     huggingface_id="EuroEval/xquad-es",
+     task=RC,
+     languages=[ES],
+     labels=["start_positions", "end_positions"],
+     prompt_prefix="A continuación se presentan textos con sus preguntas y respuestas "
+     "correspondientes.",
+     prompt_template="Texto: {text}\nPregunta: {question}\nRespuesta en máximo 3 "
+     "palabras: {label}",
+     instruction_prompt="Texto: {text}\n\nResponda la siguiente pregunta sobre el "
+     "texto anterior en máximo 3 palabras.\n\nPregunta: {question}",
+     num_few_shot_examples=4,
+     max_generated_tokens=32,
+     unofficial=True,
+ )
+
+ MLQA_ES_CONFIG = DatasetConfig(
+     name="mlqa-es",
+     pretty_name="the Spanish version of the MLQA reading comprehension dataset",
+     huggingface_id="EuroEval/mlqa-es",
+     task=RC,
+     languages=[ES],
+     labels=["start_positions", "end_positions"],
+     prompt_prefix="A continuación se presentan textos con sus preguntas y respuestas "
+     "correspondientes.",
+     prompt_template="Texto: {text}\nPregunta: {question}\nRespuesta en máximo 3 "
+     "palabras: {label}",
+     instruction_prompt="Texto: {text}\n\nResponda la siguiente pregunta sobre el "
+     "texto anterior en máximo 3 palabras.\n\nPregunta: {question}",
+     num_few_shot_examples=4,
+     max_generated_tokens=32,
+ )
+
  ### SUMMARIZATION DATASETS ###
 
  NORDJYLLAND_NEWS_CONFIG = DatasetConfig(
@@ -1358,6 +1482,19 @@ MLSUM_CONFIG = DatasetConfig(
      max_generated_tokens=256,
  )
 
+ MLSUM_ES_CONFIG = DatasetConfig(
+     name="mlsum-es",
+     pretty_name="the truncated version of the Spanish summarisation dataset MLSum",
+     huggingface_id="EuroEval/mlsum-es-mini",
+     task=SUMM,
+     languages=[ES],
+     prompt_prefix="Los siguientes son artículos de noticias con sus resúmenes.",
+     prompt_template="Artículo: {text}\nResumen: {target_text}",
+     instruction_prompt="Artículo: {text}\n\nEscribe un resumen del artículo anterior.",
+     num_few_shot_examples=1,
+     max_generated_tokens=256,
+ )
+
  RRN_CONFIG = DatasetConfig(
      name="rrn",
      pretty_name="the truncated version of the Icelandic summarisation dataset "
@@ -1745,6 +1882,23 @@ MMLU_IT_CONFIG = DatasetConfig(
      max_generated_tokens=5,
  )
 
+ MMLU_ES_CONFIG = DatasetConfig(
+     name="mmlu-es",
+     pretty_name="the truncated version of the Spanish knowledge dataset MMLU-es, "
+     "translated from the English MMLU dataset",
+     huggingface_id="EuroEval/mmlu-es-mini",
+     task=KNOW,
+     languages=[ES],
+     labels=["a", "b", "c", "d"],
+     prompt_prefix="Las siguientes son preguntas de opción múltiple (con respuestas).",
+     prompt_template="Pregunta: {text}\nRespuesta: {label}",
+     prompt_label_mapping=dict(a="a", b="b", c="c", d="d"),
+     instruction_prompt="Pregunta: {text}\n\nResponda la pregunta anterior usando "
+     "solo 'a', 'b', 'c' o 'd', y nada más.",
+     num_few_shot_examples=5,
+     max_generated_tokens=5,
+ )
+
  ARC_DA_CONFIG = DatasetConfig(
      name="arc-da",
      pretty_name="the truncated version of the Danish knowledge dataset ARC-da, "
@@ -1870,6 +2024,23 @@ ARC_CONFIG = DatasetConfig(
      unofficial=True,
  )
 
+ HELLASWAG_ES_CONFIG = DatasetConfig(
+     name="hellaswag-es",
+     pretty_name="the truncated version of the Spanish common-sense reasoning dataset "
+     "HellaSwag-es, translated from the English HellaSwag dataset",
+     huggingface_id="EuroEval/hellaswag-es-mini",
+     task=COMMON_SENSE,
+     languages=[ES],
+     labels=["a", "b", "c", "d"],
+     prompt_prefix="Las siguientes son preguntas de opción múltiple (con respuestas).",
+     prompt_template="Pregunta: {text}\nRespuesta: {label}",
+     prompt_label_mapping=dict(a="a", b="b", c="c", d="d"),
+     instruction_prompt="Pregunta: {text}\n\nResponda la pregunta anterior usando solo "
+     "'a', 'b', 'c' o 'd', y nada más.",
+     num_few_shot_examples=5,
+     max_generated_tokens=5,
+ )
+
  # TODO: Faroese knowledge
 
 
euroeval/task_utils/token_classification.py CHANGED
@@ -1,18 +1,18 @@
  """Utility functions related to the token-classification task group."""
 
- import importlib.util
  import logging
  import re
  import typing as t
  from copy import deepcopy
 
+ import demjson3
  import evaluate
  import numpy as np
  from evaluate import EvaluationModule
  from transformers import PreTrainedTokenizer
 
  from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
- from ..exceptions import InvalidBenchmark, NeedsExtraInstalled
+ from ..exceptions import InvalidBenchmark
  from ..utils import raise_if_model_output_contains_nan_values
 
  if t.TYPE_CHECKING:
@@ -20,9 +20,6 @@ if t.TYPE_CHECKING:
 
      from ..types import Labels, Predictions
 
- if importlib.util.find_spec("demjson3") is not None:
-     import demjson3
-
 
  logger = logging.getLogger("euroeval")
 
@@ -201,13 +198,10 @@ def extract_labels_from_generation(
      Returns:
          The predicted labels.
      """
-     if importlib.util.find_spec("demjson3") is None:
-         raise NeedsExtraInstalled(extra="generative")
-
      raw_predictions = model_output.sequences
 
      # Attempt to extract the JSON dictionary from the predictions
-     json_regex = r"\{.+?\}"
+     json_regex = r"\{[^{}]+?\}"
      json_matches = [
          re.search(pattern=json_regex, string=raw_prediction, flags=re.DOTALL)
          or raw_prediction
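To see what the tightened JSON regex changes in practice, here is a small comparison (not from the package) of the old and new patterns on an output containing nested braces:

```python
import re

output = 'Entities: {"persona": ["Ana"], "extra": {"nota": "x"}} plus trailing text'

old = re.search(r"\{.+?\}", output, flags=re.DOTALL)
new = re.search(r"\{[^{}]+?\}", output, flags=re.DOTALL)

# The old non-greedy pattern stops at the first "}", which here falls inside the
# nested object and yields an unbalanced fragment; the new pattern only ever
# matches a brace-free (flat) JSON object.
print(old.group())  # {"persona": ["Ana"], "extra": {"nota": "x"}
print(new.group())  # {"nota": "x"}
```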
euroeval/utils.py CHANGED
@@ -141,6 +141,7 @@ def block_terminal_output() -> None:
      logging.getLogger("vllm.transformers_utils.tokenizer").setLevel(logging.CRITICAL)
      logging.getLogger("vllm.core.scheduler").setLevel(logging.CRITICAL)
      logging.getLogger("vllm.model_executor.weight_utils").setLevel(logging.CRITICAL)
+     logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
      logging.getLogger("httpx").setLevel(logging.CRITICAL)
      logging.getLogger("ray._private.worker").setLevel(logging.CRITICAL)
      logging.getLogger("matplotlib.font_manager").setLevel(logging.CRITICAL)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 15.3.1
+ Version: 15.4.0
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -33,12 +33,14 @@ Requires-Dist: accelerate>=0.34.2
  Requires-Dist: bert-score>=0.3.13
  Requires-Dist: click>=8.1.3
  Requires-Dist: datasets>=2.15.0
+ Requires-Dist: demjson3>=3.0.6
  Requires-Dist: evaluate>=0.4.1
  Requires-Dist: huggingface-hub>=0.24.0
  Requires-Dist: levenshtein>=0.24.0
  Requires-Dist: litellm>=1.61.13
  Requires-Dist: more-itertools>=10.5.0
  Requires-Dist: numpy<2.0.0,>=1.23.0
+ Requires-Dist: ollama>=0.4.7
  Requires-Dist: pandas>=2.2.0
  Requires-Dist: protobuf~=3.20.0
  Requires-Dist: pydantic>=2.6.0
@@ -52,19 +54,19 @@ Requires-Dist: seqeval>=1.2.2
  Requires-Dist: setuptools>=75.8.2
  Requires-Dist: tenacity>=9.0.0
  Requires-Dist: termcolor>=2.0.0
- Requires-Dist: torch>=2.3.0
- Requires-Dist: transformers>=4.47.0
+ Requires-Dist: torch>=2.6.0
+ Requires-Dist: transformers>=4.50.0
  Provides-Extra: all
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
- Requires-Dist: demjson3>=3.0.6; extra == 'all'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
  Requires-Dist: gradio>=4.26.0; extra == 'all'
- Requires-Dist: vllm<0.6.5,>=0.6.3; (platform_system == 'Linux') and extra == 'all'
+ Requires-Dist: outlines>=0.1.11; extra == 'all'
+ Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'all'
  Provides-Extra: generative
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
- Requires-Dist: demjson3>=3.0.6; extra == 'generative'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
- Requires-Dist: vllm<0.6.5,>=0.6.3; (platform_system == 'Linux') and extra == 'generative'
+ Requires-Dist: outlines>=0.1.11; extra == 'generative'
+ Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
  Provides-Extra: human-evaluation
  Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
  Provides-Extra: test
@@ -202,6 +204,19 @@ argument. This could for instance be `--model <model-id> --task
  sentiment-classification`.
 
 
+ ### Reproducing the datasets
+ All datasets used in this project are generated using the scripts located in the [src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script with the following command
+
+ ```shell
+ $ uv run src/scripts/<name-of-script>.py
+ ```
+
+ Replace <name-of-script> with the specific script you wish to execute, e.g.,
+
+ ```shell
+ $ uv run src/scripts/create_allocine.py
+ ```
+
  ## Special Thanks :pray:
  - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
    models on the leaderboards.
@@ -1,12 +1,12 @@
- euroeval/__init__.py,sha256=3od9_ucHlILSbe4WCR8k5PbeorvmUr-VjOKXJ01I0fA,2165
- euroeval/benchmark_config_factory.py,sha256=pi4Lu--ySKZRd9ItG6VKS6BPLis64vL-7UE99VSXq5Y,12534
- euroeval/benchmarker.py,sha256=__DdnOvI9CNpgqPT1hsTl0GZFTyQ6KRfiQowCuh36sc,46534
+ euroeval/__init__.py,sha256=l3V3ybiCj0I193jvn8wS9VK4UEc9ajiOq4SojChH6Xs,2615
+ euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
+ euroeval/benchmarker.py,sha256=PIdqLPleLN3nml5Zb1g_dQaLzqxQhmgC8VuvD5yloV4,46524
  euroeval/callbacks.py,sha256=bThUUxOgkMuESUQ5rrFRoSumKV8vNw53CslIZTpkt54,2438
  euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
- euroeval/constants.py,sha256=qFrm3cRT6UlnTXfHUmxqZsr0SBsGskjV1qrUlnAW-aw,1473
- euroeval/data_loading.py,sha256=RoatBJMpGurP_y5O3KrEvly8Z_yYEapQnnMZ_tWWrlc,3272
+ euroeval/constants.py,sha256=9iXe26WAigL9RYob3PhsB5c0dr11wCeRxrEfm_ssynM,1562
+ euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
  euroeval/data_models.py,sha256=4ZY9x2pINlRywTzYxxtrYG7qXMNdod5I9XBOlTJYT8E,14495
- euroeval/dataset_configs.py,sha256=Cj3McxA0JTC7RKzXofzpJfmIhoXAfF756f_1SZUaPlw,84391
+ euroeval/dataset_configs.py,sha256=bjMUXvaEtTpo1Eql_mIRCG3K_lB2DZRdPWEAwR5N4ig,90627
  euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
  euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
  euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
@@ -20,21 +20,21 @@ euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
  euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
  euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
  euroeval/types.py,sha256=xvBn0eNynqAqwL7CGEgVFb_lCD9SdHUMvxJo7OXRfls,2367
- euroeval/utils.py,sha256=K4z2IQilLJo6Cf8bzM46PYTaylDv6bYi7FRbHTbZulE,18736
+ euroeval/utils.py,sha256=MkiVI-0KmK4ilKJTTfYAynKaPDOzW1WjyRdZsYmnoIg,18803
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
  euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
  euroeval/benchmark_modules/fresh.py,sha256=k6bqDEnazRAX9ILVsRrzUTbkgNO4NcLCxHToCnLWV8M,9641
- euroeval/benchmark_modules/hf.py,sha256=n3VIUA7XOOTgbSMkmYp5S06iJV0kp7aMq8YzRb0EDLw,41741
- euroeval/benchmark_modules/litellm.py,sha256=uMPzUjTU54UHDmBImzWUFCGUupKvZNQN-2u0c8UaM3s,34488
- euroeval/benchmark_modules/vllm.py,sha256=cw7onFYXQ66cr2c4WTB90VYtQYc47lkwz6A25FW8sBs,43444
+ euroeval/benchmark_modules/hf.py,sha256=YeaaP_YGAlKG5G1KFq0bFOFWv42eH_zfmhuW3FAXjAA,41726
+ euroeval/benchmark_modules/litellm.py,sha256=ZJ9dB683pXPHDf70OOJfmHn_y706xRYzstYLz2ytCKE,39784
+ euroeval/benchmark_modules/vllm.py,sha256=5N2ytLR9cZIcPeza-ERQWwyvehDd0F1FUvXY3cKu4Oo,44519
  euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
  euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
  euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
  euroeval/task_utils/sequence_classification.py,sha256=FrkvFzxFSnZoXThgpQqvJCIy3_YemyqZFQ1L-YdMMiw,8527
  euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
- euroeval/task_utils/token_classification.py,sha256=yT1YvZzmqNaVSRZ67BvyURhlkgTm3ltWPft4HxodZAE,17983
- euroeval-15.3.1.dist-info/METADATA,sha256=elF7s_zt2tj9Hl1EMMDfNoMtskYK5Xh9i-N36vvzfQs,10263
- euroeval-15.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- euroeval-15.3.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
- euroeval-15.3.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
- euroeval-15.3.1.dist-info/RECORD,,
+ euroeval/task_utils/token_classification.py,sha256=aW2GGk-dqa7lioIsHirVgD8AMrQEAnVasmjEWQ4xu7w,17778
+ euroeval-15.4.0.dist-info/METADATA,sha256=HfNWsANdb8TJAyK__QPBhs7O5qsQp9G_gPlhVVNuK9c,10724
+ euroeval-15.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ euroeval-15.4.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+ euroeval-15.4.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+ euroeval-15.4.0.dist-info/RECORD,,