EuroEval 15.14.0__py3-none-any.whl → 15.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +7 -0
- euroeval/benchmark_modules/litellm.py +155 -105
- euroeval/benchmark_modules/vllm.py +21 -15
- euroeval/benchmarker.py +10 -11
- euroeval/data_models.py +1 -1
- euroeval/dataset_configs/danish.py +10 -0
- euroeval/dataset_configs/dutch.py +10 -0
- euroeval/dataset_configs/finnish.py +10 -0
- euroeval/dataset_configs/french.py +10 -0
- euroeval/dataset_configs/german.py +10 -0
- euroeval/dataset_configs/italian.py +10 -0
- euroeval/dataset_configs/spanish.py +10 -0
- euroeval/dataset_configs/swedish.py +10 -0
- euroeval/finetuning.py +2 -1
- euroeval/generation.py +1 -1
- euroeval/human_evaluation.py +2 -1
- euroeval/metrics.py +22 -4
- euroeval/prompt_templates/multiple_choice.py +1 -1
- euroeval/task_group_utils/question_answering.py +7 -1
- euroeval/task_group_utils/sequence_classification.py +8 -1
- euroeval/task_group_utils/text_to_text.py +8 -1
- euroeval/task_group_utils/token_classification.py +9 -2
- euroeval/types.py +5 -0
- {euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/METADATA +5 -6
- {euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/RECORD +28 -28
- {euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/WHEEL +0 -0
- {euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED
@@ -86,6 +86,13 @@ os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
 
+# Allow long max model length in vLLM. This happens when vLLM registers that the model
+# has a shorter context length than the value we are inserting. But since we do a
+# thorough check of the model's config before setting the context length, we trust our
+# own checks and ignore the internal vLLM check.
+os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
+
+
 # Avoid the "Unclosed client session" error when evaluating Ollama models with LiteLLM.
 # The error comes from the `aiohttp` package, and this environment variable forces the
 # use of `httpx` instead.
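The new `VLLM_ALLOW_LONG_MAX_MODEL_LEN` flag is a plain environment variable, so the usual pattern is to set it in `os.environ` before the first vLLM import, which is what the top of `euroeval/__init__.py` does. A minimal sketch of the same pattern outside EuroEval (assumes vLLM is installed):

import os

# Configure vLLM through the environment before it is imported, so the values are
# visible whenever vLLM reads them.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"

import vllm  # noqa: E402  (imported only after the environment is configured)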
euroeval/benchmark_modules/litellm.py
CHANGED

@@ -6,7 +6,7 @@ import logging
 import os
 import re
 import typing as t
-from functools import cached_property, partial
+from functools import cache, cached_property, partial
 from time import sleep
 
 import litellm
@@ -27,6 +27,7 @@ from litellm.exceptions import (
 RateLimitError,
 ServiceUnavailableError,
 Timeout,
+UnsupportedParamsError,
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
@@ -87,6 +88,7 @@ logger = logging.getLogger("euroeval")
 
 VOCAB_SIZE_MAPPING = {
 # OpenAI models
+r"gpt-5-.*": 100_256,
 r"gpt-4-(32k)?(-[0-9]{4})?": 100_256,
 r"gpt-4-[0-9]{4}-preview": 100_256,
 r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
@@ -105,6 +107,7 @@ VOCAB_SIZE_MAPPING = {
 
 MODEL_MAX_LENGTH_MAPPING = {
 # OpenAI models
+r"gpt-5-.*": 272_000,
 r"gpt-4(-[0-9]{4})?": 8_191,
 r"gpt-4-32k(-[0-9]{4})?": 32_767,
 r"gpt-4-[0-9]{4}-preview": 128_000,
@@ -129,6 +132,7 @@ MODEL_MAX_LENGTH_MAPPING = {
 
 NUM_PARAMS_MAPPING = {
 # OpenAI models
+r"gpt-5-.*": -1,
 r"gpt-4.*": -1,
 r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
 # Anthropic models
@@ -144,6 +148,7 @@ NUM_PARAMS_MAPPING = {
 
 ALLOWED_PARAMS = {
 # OpenAI models
+r"gpt-5-.*": ["minimal", "low", "medium", "high"],
 r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "medium", "high"],
 # Anthropic models
 r"(anthropic/)?claude-3-7-sonnet.*": ["no-thinking", "thinking"],
@@ -269,28 +274,9 @@ class LiteLLMModel(BenchmarkModule):
 generative_type=self.generative_type,
 )
 
-# Set the core generation arguments
-generation_kwargs: dict[str, t.Any] = dict(
-model=self.model_config.model_id,
-max_completion_tokens=(
-REASONING_MAX_TOKENS
-if self.generative_type == GenerativeType.REASONING
-else self.dataset_config.max_generated_tokens
-),
-stop=[],
-temperature=0.0,
-seed=4242,
-api_key=self.benchmark_config.api_key,
-api_base=self.benchmark_config.api_base,
-api_version=self.benchmark_config.api_version,
-max_retries=3,
-)
-
-# Set up the `response_format` generation argument if we are dealing with a task
-# using structured generation
+# Sanity check that "JSON" is included in the prompt, as some models require
+# this
 if self.dataset_config.task in TASKS_USING_JSON:
-# Sanity check that "JSON" is included in the prompt, as some models require
-# this
 for conversation in conversations:
 if not conversation:
 raise InvalidBenchmark(
@@ -310,87 +296,6 @@ class LiteLLMModel(BenchmarkModule):
 "Prompt must contain 'json' for JSON tasks."
 )
 
-if self.generative_type == GenerativeType.REASONING:
-log_once(
-f"The model {self.model_config.model_id!r} is a reasoning model "
-"and thus does not support structured generation, so we do not "
-"enable it.",
-level=logging.DEBUG,
-)
-elif supports_response_schema(model=self.model_config.model_id):
-ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-keys_and_their_types: dict[str, t.Any] = {
-tag_name: (conlist(str, max_length=5), ...)
-for tag_name in ner_tag_names
-}
-pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
-generation_kwargs["response_format"] = pydantic_class
-log_once(
-"Enabling structured generation for model "
-f"{self.model_config.model_id!r} with the JSON schema "
-f"{pydantic_class.model_json_schema()}",
-level=logging.DEBUG,
-)
-else:
-generation_kwargs["response_format"] = dict(type="json_object")
-log_once(
-"Enabling structured JSON generation for model "
-f"{self.model_config.model_id!r} with no custom JSON schema, as "
-"the model does not support schemas.",
-level=logging.DEBUG,
-)
-
-# If the model is an Ollama reasoning model, we ensure that thinking is enabled
-if self.is_ollama and self.generative_type == GenerativeType.REASONING:
-generation_kwargs["think"] = True
-log_once(
-"Enabling thinking mode for Ollama model "
-f"{self.model_config.model_id!r}",
-level=logging.DEBUG,
-)
-
-# Handle manually set parameters
-if self.buffer["first_label_token_mapping"]:
-generation_kwargs["logprobs"] = True
-generation_kwargs["top_logprobs"] = MAX_LOGPROBS
-if self.model_config.revision == "thinking":
-generation_kwargs["thinking"] = dict(
-type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
-)
-log_once(
-f"Enabling thinking mode for model {self.model_config.model_id!r}",
-level=logging.DEBUG,
-)
-elif self.model_config.revision == "no-thinking":
-generation_kwargs["thinking"] = dict(budget_tokens=0)
-log_once(
-f"Disabling thinking mode for model {self.model_config.model_id!r}",
-level=logging.DEBUG,
-)
-elif self.model_config.revision in {"low", "medium", "high"}:
-generation_kwargs["reasoning_effort"] = self.model_config.revision
-log_once(
-f"Enabling reasoning effort {self.model_config.revision!r} for model "
-f"{self.model_config.model_id!r}",
-level=logging.DEBUG,
-)
-
-# Drop generation kwargs that are not supported by the model
-litellm.drop_params = True
-
-# First attempt is a test run with a single conversation to handle errors
-# quickly
-test_conversation = conversations[0]
-_, failures = safe_run(
-self._generate_async(
-model_id=self.model_config.model_id,
-conversations=[test_conversation],
-**generation_kwargs,
-)
-)
-for _, error in failures:
-self._handle_exception(error=error, generation_kwargs=generation_kwargs)
-
 all_responses: dict[int, "ModelResponse"] = {}
 conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
 enumerate(conversations)
@@ -404,7 +309,7 @@ class LiteLLMModel(BenchmarkModule):
 self._generate_async(
 model_id=self.model_config.model_id,
 conversations=list(batch_conversations),
-**generation_kwargs,
+**self.get_generation_kwargs(dataset_config=self.dataset_config),
 )
 )
 
@@ -431,7 +336,12 @@ class LiteLLMModel(BenchmarkModule):
 # Attempt to handle the exceptions, to improve the chance of getting
 # successful generations next time around
 for _, error in failures:
-self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+self._handle_exception(
+error=error,
+generation_kwargs=self.get_generation_kwargs(
+dataset_config=self.dataset_config
+),
+)
 
 # Sleep for a second to avoid pinging the API server too quickly
 sleep(1)
@@ -484,6 +394,7 @@ class LiteLLMModel(BenchmarkModule):
 "`temperature` may only be set to 1",
 "'temperature' does not support 0.0 with this model. Only the default "
 "(1) value is supported",
+"Only temperature=1 is supported",
 ]
 max_items_messages = ["'maxItems' is not permitted."]
 no_json_schema_messages = ["Property keys should match pattern"]
@@ -593,6 +504,20 @@ class LiteLLMModel(BenchmarkModule):
 )
 sleep(5)
 return
+elif isinstance(error, UnsupportedParamsError):
+unsupported_param_match = re.search(
+pattern=r"(?<=does not support parameters\: \[')([^ ']+)(?='\])",
+string=error.message,
+)
+if unsupported_param_match is None:
+raise InvalidModel(error.message)
+else:
+unsupported_param = unsupported_param_match.group(0)
+raise InvalidModel(
+f"The model {model_id!r} does not support the parameter "
+f"{unsupported_param!r}. Try again without this parameter. "
+"Skipping this model."
+)
 elif isinstance(error, (APIConnectionError, OSError)):
 # If there are too many I/O connections, we increase the number of allowed
 # file descriptors
@@ -1233,6 +1158,126 @@ class LiteLLMModel(BenchmarkModule):
 
 return dataset
 
+@cache
+def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
+"""Get the generation arguments for the model.
+
+Args:
+dataset_config:
+The dataset configuration, which is used to determine the generative
+type of the model. We use this as an argument here rather than using
+`self.dataset_config` to ensure that that the cache is updated when the
+dataset configuration changes.
+
+Returns:
+The generation arguments for the model.
+"""
+# Set the core generation arguments
+generation_kwargs: dict[str, t.Any] = dict(
+model=self.model_config.model_id,
+max_completion_tokens=(
+REASONING_MAX_TOKENS
+if self.generative_type == GenerativeType.REASONING
+else dataset_config.max_generated_tokens
+),
+stop=[],
+temperature=0.0,
+seed=4242,
+api_key=self.benchmark_config.api_key,
+api_base=self.benchmark_config.api_base,
+api_version=self.benchmark_config.api_version,
+max_retries=3,
+)
+
+# Set up the `response_format` generation argument if we are dealing with a task
+# using structured generation
+if dataset_config.task in TASKS_USING_JSON:
+if self.generative_type == GenerativeType.REASONING:
+log_once(
+f"The model {self.model_config.model_id!r} is a reasoning model "
+"and thus does not support structured generation, so we do not "
+"enable it.",
+level=logging.DEBUG,
+)
+elif supports_response_schema(model=self.model_config.model_id):
+ner_tag_names = list(dataset_config.prompt_label_mapping.values())
+keys_and_their_types: dict[str, t.Any] = {
+tag_name: (conlist(str, max_length=5), ...)
+for tag_name in ner_tag_names
+}
+pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+generation_kwargs["response_format"] = pydantic_class
+log_once(
+"Enabling structured generation for model "
+f"{self.model_config.model_id!r} with the JSON schema "
+f"{pydantic_class.model_json_schema()}",
+level=logging.DEBUG,
+)
+else:
+generation_kwargs["response_format"] = dict(type="json_object")
+log_once(
+"Enabling structured JSON generation for model "
+f"{self.model_config.model_id!r} with no custom JSON schema, as "
+"the model does not support schemas.",
+level=logging.DEBUG,
+)
+
+# If the model is an Ollama reasoning model, we ensure that thinking is enabled
+if self.is_ollama and self.generative_type == GenerativeType.REASONING:
+generation_kwargs["think"] = True
+log_once(
+"Enabling thinking mode for Ollama model "
+f"{self.model_config.model_id!r}",
+level=logging.DEBUG,
+)
+
+# Handle manually set parameters
+if self.buffer["first_label_token_mapping"]:
+generation_kwargs["logprobs"] = True
+generation_kwargs["top_logprobs"] = MAX_LOGPROBS
+if self.model_config.revision == "thinking":
+generation_kwargs["thinking"] = dict(
+type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
+)
+log_once(
+f"Enabling thinking mode for model {self.model_config.model_id!r}",
+level=logging.DEBUG,
+)
+elif self.model_config.revision == "no-thinking":
+generation_kwargs["thinking"] = dict(budget_tokens=0)
+log_once(
+f"Disabling thinking mode for model {self.model_config.model_id!r}",
+level=logging.DEBUG,
+)
+elif self.model_config.revision in {"minimal", "low", "medium", "high"}:
+generation_kwargs["reasoning_effort"] = self.model_config.revision
+log_once(
+f"Enabling reasoning effort {self.model_config.revision!r} for model "
+f"{self.model_config.model_id!r}",
+level=logging.DEBUG,
+)
+
+# First attempt is a test run with a single conversation to handle errors
+# quickly. We repeat this multiple times to deal with different types of
+# errors, and stop if we get a successful response.
+test_conversation = [
+litellm.ChatCompletionUserMessage(role="user", content="Test message")
+]
+for _ in range(5):
+_, failures = safe_run(
+self._generate_async(
+model_id=self.model_config.model_id,
+conversations=[test_conversation],
+**generation_kwargs,
+)
+)
+if not failures:
+break
+for _, error in failures:
+self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+
+return generation_kwargs
+
 
 def raise_if_wrong_params(
 model_config: ModelConfig, allowed_params: dict[str, list[str]]
@@ -1264,6 +1309,11 @@ def raise_if_wrong_params(
 msg += " No parameters are allowed."
 raise InvalidModel(msg)
 return
+else:
+raise InvalidModel(
+f"The parameter {param!r} is not supported for the model "
+f"{model_config.model_id!r}."
+)
 
 
 def try_download_ollama_model(model_id: str) -> bool:
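The old inline construction of `generation_kwargs` is replaced by a `get_generation_kwargs` method decorated with `functools.cache`, so the kwargs (and the one-off test generation inside it) are only built once per dataset configuration. A minimal sketch of that caching pattern, using a hypothetical `Config`/`Model` pair rather than EuroEval's own classes:

from dataclasses import dataclass
from functools import cache
import typing as t


@dataclass(frozen=True)  # frozen makes instances hashable, which `cache` requires
class Config:
    max_generated_tokens: int


class Model:
    @cache
    def build_kwargs(self, config: Config) -> dict[str, t.Any]:
        # Only runs once per (self, config) key; later calls return the cached dict.
        print("building kwargs")
        return {"max_completion_tokens": config.max_generated_tokens}


model = Model()
model.build_kwargs(Config(256))  # prints "building kwargs"
model.build_kwargs(Config(256))  # equal config -> cache hit, nothing printed
model.build_kwargs(Config(512))  # different config -> built again

Note that `cache` keys on argument equality, which is why the real method takes the dataset config as an explicit argument instead of reading `self.dataset_config`, as its docstring explains.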
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -77,10 +77,7 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
 destroy_model_parallel,
 )
 from vllm.lora.request import LoRARequest
-
-if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
-from outlines.models.vllm import adapt_tokenizer
-from outlines.processors.structured import JSONLogitsProcessor
+from vllm.sampling_params import GuidedDecodingParams
 
 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
 import ray
@@ -171,7 +168,8 @@ class VLLMModel(HuggingFaceEncoderModel):
 
 def __del__(self) -> None:
 """Clean up the model and tokenizer."""
-clear_vllm()
+if importlib.util.find_spec("vllm") is not None:
+clear_vllm()
 if hasattr(self, "_model"):
 del self._model
 if hasattr(self, "_tokenizer"):
@@ -327,7 +325,7 @@ class VLLMModel(HuggingFaceEncoderModel):
 if end_of_chat_token:
 stop_tokens.append(end_of_chat_token)
 
-
+structured_generation_schema = None
 if self.dataset_config.task in TASKS_USING_JSON:
 if self.generative_type == GenerativeType.REASONING:
 log_once(
@@ -342,15 +340,13 @@ class VLLMModel(HuggingFaceEncoderModel):
 tag_name: (conlist(str, max_length=5), ...)
 for tag_name in ner_tag_names
 }
-
-
-schema=pydantic_class,
-tokenizer=adapt_tokenizer(tokenizer=self._tokenizer), # type: ignore
-whitespace_pattern=r" ?",
+answer_format_class = create_model(
+"AnswerFormat", **keys_and_their_types
 )
+structured_generation_schema = answer_format_class.model_json_schema()
 log_once(
 "Using structured generation with the JSON schema "
-f"{
+f"{structured_generation_schema}",
 level=logging.DEBUG,
 )
@@ -374,7 +370,11 @@ class VLLMModel(HuggingFaceEncoderModel):
 logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
 temperature=0.0,
 stop=[stop_token for stop_token in stop_tokens if stop_token],
-
+guided_decoding=(
+GuidedDecodingParams(json=structured_generation_schema)
+if structured_generation_schema
+else None
+),
 )
 
 # If any of the prompts are empty then we need to replace them with a BOS token
@@ -691,8 +691,14 @@ def load_model_and_tokenizer(
 )
 dtype = torch.float16
 
-# If the model is a quantized model, we need to
-if quantization
+# If the model is a quantized model, we might need to change the dtype
+if quantization == "mxfp4" and hf_model_config.torch_dtype is None:
+dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+logger.debug(
+"You are loading a quantized model where `torch_dtype` has not been set. "
+f"Setting dtype to {dtype!r}."
+)
+elif quantization is not None and hf_model_config.torch_dtype != torch.float16:
 logger.info(
 "You are loading a quantized model with dtype "
 f"{hf_model_config.torch_dtype}, which vLLM does not support. Setting "
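Structured generation in the vLLM backend now goes through vLLM's built-in guided decoding (`GuidedDecodingParams`) instead of the removed outlines logits processor, with the JSON schema taken from a pydantic model. A rough sketch of the same wiring outside EuroEval, assuming vLLM >= 0.10 and pydantic v2 (the model id and field names are placeholders):

from pydantic import BaseModel, conlist
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams


class AnswerFormat(BaseModel):
    # Toy NER-style schema: each tag maps to at most five extracted strings.
    person: conlist(str, max_length=5)
    location: conlist(str, max_length=5)


schema = AnswerFormat.model_json_schema()
sampling_params = SamplingParams(
    max_tokens=128,
    temperature=0.0,
    # Constrain decoding to JSON that matches the schema; use None to disable.
    guided_decoding=GuidedDecodingParams(json=schema),
)
llm = LLM(model="<model-id>")  # placeholder model id
outputs = llm.generate(["List the named entities in: ..."], sampling_params)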
euroeval/benchmarker.py
CHANGED
@@ -379,7 +379,16 @@ class Benchmarker:
 
 current_benchmark_results: list[BenchmarkResult] = list()
 for model_id in model_ids:
-
+# Load the model configuration, or skip the model if it is invalid
+try:
+model_config = get_model_config(
+model_id=model_id, benchmark_config=benchmark_config
+)
+except InvalidModel as e:
+logger.info(e.message)
+num_finished_benchmarks += len(dataset_configs)
+continue
+
 loaded_model: BenchmarkModule | None = None
 for dataset_config in dataset_configs:
 # Skip if we have already benchmarked this model on this dataset and
@@ -399,16 +408,6 @@ class Benchmarker:
 num_finished_benchmarks += 1
 continue
 
-if model_config is None:
-try:
-model_config = get_model_config(
-model_id=model_id, benchmark_config=benchmark_config
-)
-except InvalidModel as e:
-logger.info(e.message)
-num_finished_benchmarks += len(dataset_configs)
-continue
-
 # Skip if the model is an encoder model and the task is generative
 task_is_generative = (
 dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
euroeval/data_models.py
CHANGED
@@ -259,7 +259,7 @@ class BenchmarkResult(pydantic.BaseModel):
 transformers_version: str | None = get_package_version("transformers")
 torch_version: str | None = get_package_version("torch")
 vllm_version: str | None = get_package_version("vllm")
-
+xgrammar_version: str | None = get_package_version("xgrammar")
 
 @classmethod
 def from_dict(cls, config: dict) -> "BenchmarkResult":
euroeval/dataset_configs/danish.py
CHANGED

@@ -128,3 +128,13 @@ MULTI_WIKI_QA_DA_CONFIG = DatasetConfig(
 languages=[DA],
 unofficial=True,
 )
+
+GOLDENSWAG_DA_CONFIG = DatasetConfig(
+name="goldenswag-da",
+pretty_name="the truncated version of the Danish common-sense reasoning "
+"dataset GoldenSwag-da, translated from the English GoldenSwag dataset",
+huggingface_id="EuroEval/goldenswag-da-mini",
+task=COMMON_SENSE,
+languages=[DA],
+unofficial=True,
+)

euroeval/dataset_configs/dutch.py
CHANGED

@@ -120,3 +120,13 @@ MULTI_WIKI_QA_NL_CONFIG = DatasetConfig(
 languages=[NL],
 unofficial=True,
 )
+
+GOLDENSWAG_NL_CONFIG = DatasetConfig(
+name="goldenswag-nl",
+pretty_name="the truncated version of the Dutch common-sense reasoning "
+"dataset GoldenSwag-nl, translated from the English GoldenSwag dataset",
+huggingface_id="EuroEval/goldenswag-nl-mini",
+task=COMMON_SENSE,
+languages=[NL],
+unofficial=True,
+)

euroeval/dataset_configs/finnish.py
CHANGED

@@ -78,3 +78,13 @@ MULTI_WIKI_QA_FI_CONFIG = DatasetConfig(
 languages=[FI],
 unofficial=True,
 )
+
+GOLDENSWAG_FI_CONFIG = DatasetConfig(
+name="goldenswag-fi",
+pretty_name="the truncated version of the Finnish common-sense reasoning "
+"dataset GoldenSwag-fi, translated from the English GoldenSwag dataset",
+huggingface_id="EuroEval/goldenswag-fi-mini",
+task=COMMON_SENSE,
+languages=[FI],
+unofficial=True,
+)

euroeval/dataset_configs/french.py
CHANGED

@@ -91,3 +91,13 @@ MULTI_WIKI_QA_FR_CONFIG = DatasetConfig(
 languages=[FR],
 unofficial=True,
 )
+
+GOLDENSWAG_FR_CONFIG = DatasetConfig(
+name="goldenswag-fr",
+pretty_name="the truncated version of the French common-sense reasoning "
+"dataset GoldenSwag-fr, translated from the English GoldenSwag dataset",
+huggingface_id="EuroEval/goldenswag-fr-mini",
+task=COMMON_SENSE,
+languages=[FR],
+unofficial=True,
+)

euroeval/dataset_configs/german.py
CHANGED

@@ -99,3 +99,13 @@ MULTI_WIKI_QA_DE_CONFIG = DatasetConfig(
 languages=[DE],
 unofficial=True,
 )
+
+GOLDENSWAG_DE_CONFIG = DatasetConfig(
+name="goldenswag-de",
+pretty_name="the truncated version of the German common-sense reasoning "
+"dataset GoldenSwag-de, translated from the English GoldenSwag dataset",
+huggingface_id="EuroEval/goldenswag-de-mini",
+task=COMMON_SENSE,
+languages=[DE],
+unofficial=True,
+)

euroeval/dataset_configs/italian.py
CHANGED

@@ -99,3 +99,13 @@ MULTI_WIKI_QA_IT_CONFIG = DatasetConfig(
 languages=[IT],
 unofficial=True,
 )
+
+GOLDENSWAG_IT_CONFIG = DatasetConfig(
+name="goldenswag-it",
+pretty_name="the truncated version of the Italian common-sense reasoning "
+"dataset GoldenSwag-it, translated from the English GoldenSwag dataset",
+huggingface_id="EuroEval/goldenswag-it-mini",
+task=COMMON_SENSE,
+languages=[IT],
+unofficial=True,
+)

euroeval/dataset_configs/spanish.py
CHANGED

@@ -97,3 +97,13 @@ MULTI_WIKI_QA_ES_CONFIG = DatasetConfig(
 languages=[ES],
 unofficial=True,
 )
+
+GOLDENSWAG_ES_CONFIG = DatasetConfig(
+name="goldenswag-es",
+pretty_name="the truncated version of the Spanish common-sense reasoning "
+"dataset GoldenSwag-es, translated from the English GoldenSwag dataset",
+huggingface_id="EuroEval/goldenswag-es-mini",
+task=COMMON_SENSE,
+languages=[ES],
+unofficial=True,
+)

euroeval/dataset_configs/swedish.py
CHANGED

@@ -108,3 +108,13 @@ MULTI_WIKI_QA_SV_CONFIG = DatasetConfig(
 languages=[SV],
 unofficial=True,
 )
+
+GOLDENSWAG_SV_CONFIG = DatasetConfig(
+name="goldenswag-sv",
+pretty_name="the truncated version of the Swedish common-sense reasoning "
+"dataset GoldenSwag-sv, translated from the English GoldenSwag dataset",
+huggingface_id="EuroEval/goldenswag-sv-mini",
+task=COMMON_SENSE,
+languages=[SV],
+unofficial=True,
+)
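Each of the eight language modules gains an unofficial `goldenswag-*` config pointing at an `EuroEval/goldenswag-*-mini` dataset on the Hugging Face Hub and reusing the existing COMMON_SENSE task. A hedged usage sketch, assuming the documented `Benchmarker` API accepts the model id and dataset name as keyword arguments (the model id is a placeholder):

from euroeval import Benchmarker

benchmarker = Benchmarker()
# "goldenswag-da" is one of the new unofficial datasets in this release; the other
# languages follow the same naming scheme (goldenswag-de, goldenswag-sv, ...).
benchmarker.benchmark(model="<model-id>", dataset="goldenswag-da")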
euroeval/finetuning.py
CHANGED
@@ -3,6 +3,7 @@
 import logging
 import sys
 import typing as t
+from functools import partial
 
 import torch
 from tqdm.auto import tqdm
@@ -198,7 +199,7 @@ def finetune_single_iteration(
 args=training_args,
 train_dataset=dataset["train"],
 eval_dataset=dataset["val"],
-compute_metrics=model.compute_metrics,
+compute_metrics=partial(model.compute_metrics, dataset=None),
 callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
 data_collator=model.data_collator,
 preprocess_logits_for_metrics=remove_extra_tensors_from_logits,
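Hugging Face's `Trainer` calls `compute_metrics` with a single `EvalPrediction` argument, so the new `dataset` parameter has to be pre-bound; `functools.partial` with `dataset=None` does exactly that. The same trick in isolation, with a hypothetical `compute_metrics`:

from functools import partial


def compute_metrics(eval_pred, dataset=None):
    # `dataset` carries optional metadata; the finetuning path passes None.
    predictions, labels = eval_pred
    return {"accuracy": sum(p == r for p, r in zip(predictions, labels)) / len(labels)}


# Trainer only supplies `eval_pred`, so `dataset` is fixed up front.
trainer_compatible = partial(compute_metrics, dataset=None)
print(trainer_compatible(([1, 0, 1], [1, 1, 1])))  # {'accuracy': 0.666...}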
euroeval/generation.py
CHANGED
@@ -235,7 +235,7 @@ def generate_single_iteration(
 )
 
 itr_scores: dict[str, float] = model.compute_metrics(
-model_outputs_and_labels=(all_preds, ground_truth)
+model_outputs_and_labels=(all_preds, ground_truth), dataset=dataset
 )
 
 return itr_scores
euroeval/human_evaluation.py
CHANGED
@@ -620,7 +620,8 @@ class HumanEvaluator:
 )
 ground_truth = self.active_dataset["label"]
 itr_scores: dict[str, float] = self.compute_metrics(
-model_outputs_and_labels=(all_preds, ground_truth)
+model_outputs_and_labels=(all_preds, ground_truth),
+dataset=self.active_dataset,
 )
 
 # We reverse the order, as the Info messages are printed in reverse order
euroeval/metrics.py
CHANGED
@@ -14,6 +14,7 @@ from .exceptions import InvalidBenchmark
 from .utils import HiddenPrints
 
 if t.TYPE_CHECKING:
+from datasets.arrow_dataset import Dataset
 from evaluate import EvaluationModule
 
 logger = logging.getLogger(__name__)
@@ -49,7 +50,9 @@ class Metric(abc.ABC):
 )
 
 @abc.abstractmethod
-def __call__(
+def __call__(
+self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
+) -> float | None:
 """Calculate the metric score.
 
 Args:
@@ -57,6 +60,9 @@ class Metric(abc.ABC):
 The model predictions.
 references:
 The ground truth references.
+dataset:
+The dataset used for evaluation. This is only used in case any
+additional metadata is used to compute the metrics.
 
 Returns:
 The calculated metric score, or None if the score should be ignored.
@@ -125,7 +131,9 @@ class HuggingFaceMetric(Metric):
 )
 self.metric: "EvaluationModule | None" = None
 
-def __call__(
+def __call__(
+self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
+) -> float | None:
 """Calculate the metric score.
 
 Args:
@@ -133,6 +141,9 @@ class HuggingFaceMetric(Metric):
 The model predictions.
 references:
 The ground truth references.
+dataset:
+The dataset used for evaluation. This is only used in case any
+additional metadata is used to compute the metrics.
 
 Returns:
 The calculated metric score, or None if the score should be ignored.
@@ -213,7 +224,9 @@ class LLMAsAJudgeMetric(Metric):
 self.condition_formatting_fn = condition_formatting_fn
 self.system_prompt = system_prompt
 
-def __call__(
+def __call__(
+self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
+) -> float | None:
 """Calculate the metric score using the judge model.
 
 Args:
@@ -221,6 +234,9 @@ class LLMAsAJudgeMetric(Metric):
 The model predictions.
 references:
 The ground truth references.
+dataset:
+The dataset used for evaluation. This is only used in case any
+additional metadata is used to compute the metrics.
 
 Returns:
 The calculated metric score, or None if the score should be ignored.
@@ -343,7 +359,9 @@ class SpeedMetric(Metric):
 postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
 )
 
-def __call__(
+def __call__(
+self, _: t.Sequence, __: t.Sequence, ___: "Dataset | None"
+) -> float | None:
 """Not used with the speed metric, but required for consistency."""
 raise NotImplementedError
 
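With the new signature, every metric is called with the evaluation dataset as a third argument so that per-example metadata can feed into the score. A minimal illustration of a callable following that shape (the class name and scoring logic are made up for the example, not part of EuroEval):

import typing as t


class ExactMatchMetric:
    """Toy metric following the new three-argument call signature."""

    name = "exact_match"

    def __call__(
        self, predictions: t.Sequence, references: t.Sequence, dataset: t.Any = None
    ) -> float | None:
        # `dataset` is only needed when extra metadata informs the score; this toy
        # metric ignores it but keeps the parameter to match the interface.
        return sum(p == r for p, r in zip(predictions, references)) / len(references)


print(ExactMatchMetric()(["a", "b"], ["a", "c"], dataset=None))  # 0.5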
euroeval/prompt_templates/multiple_choice.py
CHANGED

@@ -69,7 +69,7 @@ MULTIPLE_CHOICE_TEMPLATES = {
 IT: PromptConfig(
 default_prompt_prefix="Le seguenti sono domande a scelta multipla "
 "(con relative risposte).",
-default_prompt_template="Domanda: {text}\
+default_prompt_template="Domanda: {text}\nRisposta: {label}",
 default_instruction_prompt="Domanda: {text}\n\nRispondete alla domanda "
 "precedente con {labels_str}, e nient'altro.",
 default_prompt_label_mapping="auto",
euroeval/task_group_utils/question_answering.py
CHANGED

@@ -149,6 +149,7 @@ class QuestionAnsweringTrainer(Trainer):
 def compute_metrics(
 model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
 dataset_config: "DatasetConfig",
+dataset: "Dataset",
 ) -> dict[str, float]:
 """Compute the metrics needed for evaluation.
 
@@ -158,6 +159,9 @@ def compute_metrics(
 contains the true labels.
 dataset_config:
 The configuration of the dataset.
+dataset:
+The dataset used for evaluation. This is only used in case any additional
+metadata is used to compute the metrics.
 
 Returns:
 A dictionary with the names of the metrics as keys and the metric values as
@@ -181,7 +185,9 @@ def compute_metrics(
 
 results: dict[str, float] = dict()
 for metric in dataset_config.task.metrics:
-score: float | None = metric(
+score: float | None = metric(
+predictions=predictions, references=labels, dataset=dataset
+)
 
 # The metric returns None if we are running on multi-GPU and the current
 # process is not the main process
euroeval/task_group_utils/sequence_classification.py
CHANGED

@@ -11,6 +11,7 @@ from ..exceptions import InvalidBenchmark
 from ..utils import log_once, raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
+from datasets.arrow_dataset import Dataset
 from transformers.trainer_utils import EvalPrediction
 
 from ..data_models import DatasetConfig, GenerativeModelOutput
@@ -23,6 +24,7 @@ logger = logging.getLogger("euroeval")
 def compute_metrics(
 model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
 dataset_config: "DatasetConfig",
+dataset: "Dataset",
 ) -> dict[str, float]:
 """Compute the metrics needed for evaluation.
 
@@ -32,6 +34,9 @@ def compute_metrics(
 contains the true labels.
 dataset_config:
 The configuration of the dataset.
+dataset:
+The dataset used for evaluation. This is only used in case any additional
+metadata is used to compute the metrics.
 
 Returns:
 A dictionary with the names of the metrics as keys and the metric values as
@@ -73,7 +78,9 @@ def compute_metrics(
 
 results: dict[str, float] = dict()
 for metric in dataset_config.task.metrics:
-score: float | None = metric(
+score: float | None = metric(
+predictions=predictions, references=label_ids, dataset=dataset
+)
 
 # The metric returns None if we are running on multi-GPU and the current
 # process is not the main process
euroeval/task_group_utils/text_to_text.py
CHANGED

@@ -11,6 +11,7 @@ from ..metrics import HuggingFaceMetric
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
+from datasets.arrow_dataset import Dataset
 from transformers.trainer_utils import EvalPrediction
 
 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
@@ -24,6 +25,7 @@ def compute_metrics(
 model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
 dataset_config: "DatasetConfig",
 benchmark_config: "BenchmarkConfig",
+dataset: "Dataset",
 ) -> dict[str, float]:
 """Compute the metrics needed for evaluation.
 
@@ -35,6 +37,9 @@ def compute_metrics(
 The configuration of the dataset.
 benchmark_config:
 The configuration of the benchmark.
+dataset:
+The dataset used for evaluation. This is only used in case any additional
+metadata is used to compute the metrics.
 
 Returns:
 A dictionary with the names of the metrics as keys and the metric values as
@@ -69,7 +74,9 @@ def compute_metrics(
 
 while True:
 try:
-score: float | None = metric(
+score: float | None = metric(
+predictions=predictions, references=labels, dataset=dataset
+)
 break
 except Exception as e:
 oom_error = [
euroeval/task_group_utils/token_classification.py
CHANGED

@@ -12,6 +12,7 @@ from ..exceptions import InvalidBenchmark
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
+from datasets.arrow_dataset import Dataset
 from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.tokenization_utils_base import BatchEncoding
 from transformers.trainer_utils import EvalPrediction
@@ -27,6 +28,7 @@ def compute_metrics(
 model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
 has_misc_tags: bool,
 dataset_config: "DatasetConfig",
+dataset: "Dataset",
 ) -> dict[str, float]:
 """Compute the metrics needed for evaluation.
 
@@ -38,6 +40,9 @@ def compute_metrics(
 Whether the dataset has MISC tags.
 dataset_config:
 The configuration of the dataset.
+dataset:
+The dataset used for evaluation. This is only used in case any additional
+metadata is used to compute the metrics.
 
 Returns:
 A dictionary with the names of the metrics as keys and the metric values as
@@ -136,7 +141,9 @@ def compute_metrics(
 for metric in dataset_config.task.metrics
 if metric.name == "micro_f1"
 )
-micro_f1_score = metric(
+micro_f1_score = metric(
+predictions=predictions, references=list(labels), dataset=dataset
+)
 
 # Compute the metrics without MISC tags
 # We manually set the F1 metric to be 100% if both the labels and the models
@@ -158,7 +165,7 @@ def compute_metrics(
 if metric.name == "micro_f1_no_misc"
 )
 micro_f1_no_misc_score = metric(
-predictions=predictions_no_misc, references=labels_no_misc
+predictions=predictions_no_misc, references=labels_no_misc, dataset=dataset
 )
 
 # Raise error if the metrics are invalid
euroeval/types.py
CHANGED
@@ -5,6 +5,7 @@ import typing as t
 from transformers.trainer_utils import EvalPrediction
 
 if t.TYPE_CHECKING:
+from datasets.arrow_dataset import Dataset
 from numpy.typing import NDArray
 
 from .data_models import GenerativeModelOutput
@@ -25,12 +26,16 @@ class ComputeMetricsFunction(t.Protocol):
 "NDArray | list[str] | list[list[str]]",
 "NDArray | list[str] | list[list[str]]",
 ],
+dataset: "Dataset",
 ) -> dict[str, float]:
 """Compute the metrics.
 
 Args:
 model_outputs_and_labels:
 The model outputs and labels.
+dataset:
+The dataset used for evaluation. This is only used in case any
+additional metadata is used to compute the metrics.
 
 Returns:
 The computed metrics.
{euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.14.0
+Version: 15.16.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -56,18 +56,16 @@ Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
 Requires-Dist: torch>=2.6.0
-Requires-Dist: transformers>=4.
+Requires-Dist: transformers>=4.55.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
-Requires-Dist:
-Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: vllm>=0.10.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
-Requires-Dist:
-Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: vllm>=0.10.0; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test
@@ -235,6 +233,7 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
+<a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>
 
 
 ### Contribute to EuroEval
{euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/RECORD
CHANGED

@@ -1,19 +1,19 @@
-euroeval/__init__.py,sha256=
+euroeval/__init__.py,sha256=ZZoVc6tKWz_h8Pw2n26PV-q_Gd4TM_02O235ZBRUNJw,3756
 euroeval/benchmark_config_factory.py,sha256=jKC8bEzJSGGCcG8aWsPxiyHX6fjOQYQWvkp1MIUuHYM,11564
-euroeval/benchmarker.py,sha256=
+euroeval/benchmarker.py,sha256=6qo0ytRnvZLxTQZvo2Fryox5DFHGrLsa0tVGquLHdTQ,48419
 euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
 euroeval/cli.py,sha256=h81Lswm_q9htkYz-GQQQVIsdsUPnfe3LDH8AZdBcpKs,8602
 euroeval/constants.py,sha256=0KHrH74zGM8vNF4uZG_a5qFJRZH5YgyQULYZtCKlo68,2452
 euroeval/data_loading.py,sha256=DP-cqwN_d0Y-KaN8P8c3fDr6PX80UYROHgRwX82ix4w,4156
-euroeval/data_models.py,sha256=
+euroeval/data_models.py,sha256=qSCNq3PV7qo--gibqEvvu4cXkEkhGGAb6UiZW8U_KiU,22031
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
-euroeval/finetuning.py,sha256=
-euroeval/generation.py,sha256=
+euroeval/finetuning.py,sha256=Wzagme1n3lSZLWX0WbKMHtSUlAZr8t8_FJvggDZf72c,11393
+euroeval/generation.py,sha256=lmvu__6w3cLxi0zBtXSlyZvV8CJpV3BdajUoIEA9ElA,11639
 euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
-euroeval/human_evaluation.py,sha256=
+euroeval/human_evaluation.py,sha256=FLuTl1DHxCiWB_laVVQHIH86yXvA_ZeNNSrUmyExZXI,27579
 euroeval/languages.py,sha256=cr_Z5jtaHb2XY0zeOhuk3ATHX74PODzt6gMPC2zMD7c,8594
-euroeval/metrics.py,sha256=
+euroeval/metrics.py,sha256=m8nVnxUnwmIrlBfW8pkN4FCMjW3Sbg9Iq4oMZFAicEc,16227
 euroeval/model_cache.py,sha256=HgXTgn4RMBqIjKaTmYzxu0f4NIwbXx1XJFbvbITqy4E,8686
 euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
 euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
@@ -21,43 +21,43 @@ euroeval/scores.py,sha256=TatSbjia7Zwj71gQFyV_gCHyppMbOgeaZgNCib8G86k,2849
 euroeval/speed_benchmark.py,sha256=6bFGeMmtdl_6owkxNQ3ZKiyQQS58k0NApzlsbDgBW5s,4037
 euroeval/tasks.py,sha256=btxf29M5rUP7JjBl6u9aQlHQAxrJNP4bRbdEQtDnmDA,3376
 euroeval/tokenization_utils.py,sha256=LxgGs7juS5PuMYt5LL2X6eVXdtnpi-A2jFxqcWpF6NA,17931
-euroeval/types.py,sha256=
+euroeval/types.py,sha256=SCKOALV_-F1PAIwQ7qHNdSF1Uy29TSu9nIc1NYJGUUs,2754
 euroeval/utils.py,sha256=5R7y67xe0ODaje7k8nOu2AFS3Ph2gcsiWpIq5rjSSuA,11613
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=D1oKD16KBvxEoBUfqwvzvcDc1hx6letdD3v1PnBmF4A,10669
 euroeval/benchmark_modules/fresh.py,sha256=sg_AXNPApFObCzCRWhCgKxfr-eqQsT6Ri0xx0_Yy5JM,10293
 euroeval/benchmark_modules/hf.py,sha256=-W_bWEdm0zePkn4nDz4l0T4hhJJnlfwHrtIO3m5BrUs,44725
-euroeval/benchmark_modules/litellm.py,sha256=
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/litellm.py,sha256=ibdbOmxAO1VsuZX4uUs5MQ8pFPfqPJoleOOjAim3syY,55493
+euroeval/benchmark_modules/vllm.py,sha256=7PhfqqeRGFdzOL-RBJbrHEAMGfwrVWngF14dSeq9IpI,39072
 euroeval/dataset_configs/__init__.py,sha256=EbjEyHwBtSztASl8_xblD8hessruDdV4Eg1vXrmGOuY,1935
-euroeval/dataset_configs/danish.py,sha256
-euroeval/dataset_configs/dutch.py,sha256=
+euroeval/dataset_configs/danish.py,sha256=0lDtvpgszXY1XaPjTU8yA3oNCU8W2OllvrBWgn6pkhk,4027
+euroeval/dataset_configs/dutch.py,sha256=ekZxLL9d09BUMijCxy9EFa2heNQVvySPySOjhWdtJc8,3815
 euroeval/dataset_configs/english.py,sha256=uQAaGWpHk8xqFCeIhmmPXYTb1cZomeEdRaRe9qIZQrg,2858
 euroeval/dataset_configs/faroese.py,sha256=gkgxQTWGFbfg9Eo1z-NSLROgKDcaij9tAN2mfgtrt0M,1647
-euroeval/dataset_configs/finnish.py,sha256=
-euroeval/dataset_configs/french.py,sha256=
-euroeval/dataset_configs/german.py,sha256=
+euroeval/dataset_configs/finnish.py,sha256=UZwy0_d17O2L-v2AKOu3OlDwFPcLGTZNAOt7ZKlr4K8,2679
+euroeval/dataset_configs/french.py,sha256=Hei2M4bGIz8hVtaPKQlQATcmK-0bFBNEocEszR3gia0,3014
+euroeval/dataset_configs/german.py,sha256=sRYtOl6CYf4kZkeINfff6xoKBG4OsDxb2b72lKwELGc,3192
 euroeval/dataset_configs/icelandic.py,sha256=g21IHjcwEZvf_yJ9PobeuBOqRiLOk0oCdEjY34g-UMk,4497
-euroeval/dataset_configs/italian.py,sha256=
+euroeval/dataset_configs/italian.py,sha256=4SEmdUyfGbbwMPhv_9nL3JNJtoDKHLAlWuvr7Ihmi9o,3294
 euroeval/dataset_configs/norwegian.py,sha256=-WvQM44xCwjrqBzlAy4rjf6v87fGera2JmZV_069TeQ,6003
 euroeval/dataset_configs/portuguese.py,sha256=3SqbwD0PNTILGALzh50pVoEwC-spRD75ZeE2NEj151E,2367
-euroeval/dataset_configs/spanish.py,sha256=
-euroeval/dataset_configs/swedish.py,sha256=
+euroeval/dataset_configs/spanish.py,sha256=Bm0Z19Mh2qYXR0RIRlqEkzfVb5KiqJRectfuY7JLql4,3192
+euroeval/dataset_configs/swedish.py,sha256=js4paNsuC0nQzPpf6_BzHBf7MT60XUpP1-qM2uxRtQs,3445
 euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
 euroeval/prompt_templates/linguistic_acceptability.py,sha256=ZN71BEt4HAhSYY-GWjh-S-iVvq5AODQJThkrjDhy4oM,7138
-euroeval/prompt_templates/multiple_choice.py,sha256=
+euroeval/prompt_templates/multiple_choice.py,sha256=wHnQCE5bv947L6hSK5zJitE37V-PbuNYAp156mWaIYA,5494
 euroeval/prompt_templates/named_entity_recognition.py,sha256=ga21s9T4_Hhbf88boWm7gnL7OgD7txuS_EeDgXaxEoE,13602
 euroeval/prompt_templates/reading_comprehension.py,sha256=3Nch-9zHfUDIwy-k5mP-TRhHQRQ9nad8HdhpJ1S8nGc,7072
 euroeval/prompt_templates/sentiment_classification.py,sha256=2Xsmj8lbaAXACHhwbbR4dWhoKyKB87TqpMO-ssQ-Djo,7649
 euroeval/prompt_templates/summarization.py,sha256=I98LlUOBVa_xo02npq7BWKKZOXGqm-_15i64QzbEsb0,5334
 euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_group_utils/multiple_choice_classification.py,sha256=yfy8lczpZ_MY-Y4FQx3Et9vEUpuD3YMFjF3wQGCfMNw,6632
-euroeval/task_group_utils/question_answering.py,sha256=
-euroeval/task_group_utils/sequence_classification.py,sha256=
-euroeval/task_group_utils/text_to_text.py,sha256=
-euroeval/task_group_utils/token_classification.py,sha256=
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
+euroeval/task_group_utils/question_answering.py,sha256=6jpiHukzA7IrJh4vVYyZDDyvD5Xc2GsxoXzpm_PHpXw,27503
+euroeval/task_group_utils/sequence_classification.py,sha256=ihJO55f3Dy565d3ByYGMuSINasnjAADaTrM59LwZzA0,12977
+euroeval/task_group_utils/text_to_text.py,sha256=go0y6X9QAv5iywlLAclb8cqFX_3QlAT-1-VNZ9zMWFA,4832
+euroeval/task_group_utils/token_classification.py,sha256=BDqOfopdH5Bbj67HTEbZd9KZtNCDNket8NrCTfxZFzQ,17773
+euroeval-15.16.0.dist-info/METADATA,sha256=_oeIq0ZGzS0i7n51NdhNhuDX2A3_lDjYDD-6KgB1rW0,13536
+euroeval-15.16.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.16.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.16.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+euroeval-15.16.0.dist-info/RECORD,,
{euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/WHEEL
File without changes

{euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/entry_points.txt
File without changes

{euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/licenses/LICENSE
File without changes