EuroEval 15.15.0.tar.gz → 15.16.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (252)
  1. {euroeval-15.15.0 → euroeval-15.16.0}/.github/ISSUE_TEMPLATE/bug.yaml +1 -1
  2. {euroeval-15.15.0 → euroeval-15.16.0}/.github/workflows/ci.yaml +4 -2
  3. {euroeval-15.15.0 → euroeval-15.16.0}/.pre-commit-config.yaml +2 -2
  4. {euroeval-15.15.0 → euroeval-15.16.0}/CHANGELOG.md +17 -0
  5. {euroeval-15.15.0 → euroeval-15.16.0}/PKG-INFO +3 -2
  6. {euroeval-15.15.0 → euroeval-15.16.0}/README.md +1 -0
  7. {euroeval-15.15.0 → euroeval-15.16.0}/pyproject.toml +2 -2
  8. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/litellm.py +155 -105
  9. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/vllm.py +10 -3
  10. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmarker.py +10 -11
  11. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/finetuning.py +2 -1
  12. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/metrics.py +6 -4
  13. {euroeval-15.15.0 → euroeval-15.16.0}/uv.lock +5 -5
  14. {euroeval-15.15.0 → euroeval-15.16.0}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
  15. {euroeval-15.15.0 → euroeval-15.16.0}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  16. {euroeval-15.15.0 → euroeval-15.16.0}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
  17. {euroeval-15.15.0 → euroeval-15.16.0}/.gitignore +0 -0
  18. {euroeval-15.15.0 → euroeval-15.16.0}/CITATION.cff +0 -0
  19. {euroeval-15.15.0 → euroeval-15.16.0}/CODE_OF_CONDUCT.md +0 -0
  20. {euroeval-15.15.0 → euroeval-15.16.0}/CONTRIBUTING.md +0 -0
  21. {euroeval-15.15.0 → euroeval-15.16.0}/Dockerfile.cuda +0 -0
  22. {euroeval-15.15.0 → euroeval-15.16.0}/LICENSE +0 -0
  23. {euroeval-15.15.0 → euroeval-15.16.0}/NEW_DATASET_GUIDE.md +0 -0
  24. {euroeval-15.15.0 → euroeval-15.16.0}/docs/CNAME +0 -0
  25. {euroeval-15.15.0 → euroeval-15.16.0}/docs/README.md +0 -0
  26. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/README.md +0 -0
  27. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/danish.md +0 -0
  28. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/dutch.md +0 -0
  29. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/english.md +0 -0
  30. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/faroese.md +0 -0
  31. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/finnish.md +0 -0
  32. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/french.md +0 -0
  33. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/german.md +0 -0
  34. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/icelandic.md +0 -0
  35. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/italian.md +0 -0
  36. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/norwegian.md +0 -0
  37. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/portuguese.md +0 -0
  38. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/spanish.md +0 -0
  39. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/swedish.md +0 -0
  40. {euroeval-15.15.0 → euroeval-15.16.0}/docs/extras/radial_plotter.md +0 -0
  41. {euroeval-15.15.0 → euroeval-15.16.0}/docs/faq.md +0 -0
  42. {euroeval-15.15.0 → euroeval-15.16.0}/docs/gfx/favicon.png +0 -0
  43. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/danish.md +0 -0
  44. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/dutch.md +0 -0
  45. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/english.md +0 -0
  46. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/faroese.md +0 -0
  47. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/finnish.md +0 -0
  48. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/french.md +0 -0
  49. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/german.md +0 -0
  50. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  51. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/italian.md +0 -0
  52. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  53. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/spanish.md +0 -0
  54. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/swedish.md +0 -0
  55. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Multilingual/european.md +0 -0
  56. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Multilingual/germanic.md +0 -0
  57. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  58. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Multilingual/romance.md +0 -0
  59. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/README.md +0 -0
  60. {euroeval-15.15.0 → euroeval-15.16.0}/docs/methodology.md +0 -0
  61. {euroeval-15.15.0 → euroeval-15.16.0}/docs/python-package.md +0 -0
  62. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/README.md +0 -0
  63. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/common-sense-reasoning.md +0 -0
  64. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/knowledge.md +0 -0
  65. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/linguistic-acceptability.md +0 -0
  66. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/named-entity-recognition.md +0 -0
  67. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/reading-comprehension.md +0 -0
  68. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/sentiment-classification.md +0 -0
  69. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/speed.md +0 -0
  70. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/summarization.md +0 -0
  71. {euroeval-15.15.0 → euroeval-15.16.0}/gfx/euroeval.png +0 -0
  72. {euroeval-15.15.0 → euroeval-15.16.0}/gfx/euroeval.xcf +0 -0
  73. {euroeval-15.15.0 → euroeval-15.16.0}/gfx/scandeval.png +0 -0
  74. {euroeval-15.15.0 → euroeval-15.16.0}/makefile +0 -0
  75. {euroeval-15.15.0 → euroeval-15.16.0}/mkdocs.yaml +0 -0
  76. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/__init__.py +0 -0
  77. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_config_factory.py +0 -0
  78. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/__init__.py +0 -0
  79. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/base.py +0 -0
  80. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/fresh.py +0 -0
  81. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/hf.py +0 -0
  82. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/callbacks.py +0 -0
  83. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/cli.py +0 -0
  84. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/constants.py +0 -0
  85. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/data_loading.py +0 -0
  86. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/data_models.py +0 -0
  87. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/__init__.py +0 -0
  88. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/danish.py +0 -0
  89. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/dutch.py +0 -0
  90. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/english.py +0 -0
  91. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/faroese.py +0 -0
  92. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/finnish.py +0 -0
  93. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/french.py +0 -0
  94. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/german.py +0 -0
  95. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/icelandic.py +0 -0
  96. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/italian.py +0 -0
  97. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/norwegian.py +0 -0
  98. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/portuguese.py +0 -0
  99. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/spanish.py +0 -0
  100. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/swedish.py +0 -0
  101. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/enums.py +0 -0
  102. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/exceptions.py +0 -0
  103. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/generation.py +0 -0
  104. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/generation_utils.py +0 -0
  105. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/human_evaluation.py +0 -0
  106. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/languages.py +0 -0
  107. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/model_cache.py +0 -0
  108. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/model_config.py +0 -0
  109. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/model_loading.py +0 -0
  110. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/__init__.py +0 -0
  111. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
  112. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
  113. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
  114. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
  115. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
  116. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/summarization.py +0 -0
  117. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/scores.py +0 -0
  118. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/speed_benchmark.py +0 -0
  119. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/__init__.py +0 -0
  120. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
  121. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/question_answering.py +0 -0
  122. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/sequence_classification.py +0 -0
  123. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/text_to_text.py +0 -0
  124. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/token_classification.py +0 -0
  125. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/tasks.py +0 -0
  126. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/tokenization_utils.py +0 -0
  127. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/types.py +0 -0
  128. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/utils.py +0 -0
  129. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/constants.py +0 -0
  130. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_allocine.py +0 -0
  131. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_angry_tweets.py +0 -0
  132. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_arc.py +0 -0
  133. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_arc_is.py +0 -0
  134. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_belebele.py +0 -0
  135. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_boolq_pt.py +0 -0
  136. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_cnn_dailymail.py +0 -0
  137. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_conll_en.py +0 -0
  138. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_conll_es.py +0 -0
  139. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_conll_nl.py +0 -0
  140. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_dane.py +0 -0
  141. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_danish_citizen_tests.py +0 -0
  142. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_dansk.py +0 -0
  143. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_danske_talemaader.py +0 -0
  144. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_danske_talemaader_old.py +0 -0
  145. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_dbrd.py +0 -0
  146. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_dutch_cola.py +0 -0
  147. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_eltec.py +0 -0
  148. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_fone.py +0 -0
  149. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_foqa.py +0 -0
  150. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_fosent.py +0 -0
  151. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_fquad.py +0 -0
  152. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_germanquad.py +0 -0
  153. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_germeval.py +0 -0
  154. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_goldenswag.py +0 -0
  155. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_harem.py +0 -0
  156. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_hellaswag.py +0 -0
  157. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_hellaswag_fi.py +0 -0
  158. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  159. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_ice_linguistic.py +0 -0
  160. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_icelandic_error_corpus.py +0 -0
  161. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_icelandic_knowledge.py +0 -0
  162. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_icelandic_qa.py +0 -0
  163. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_icesum.py +0 -0
  164. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_idioms_no.py +0 -0
  165. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_ilpost_sum.py +0 -0
  166. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_jentoft.py +0 -0
  167. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_life_in_the_uk.py +0 -0
  168. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_mim_gold_ner.py +0 -0
  169. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_mlqa_es.py +0 -0
  170. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_mlsum_de.py +0 -0
  171. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_mlsum_es.py +0 -0
  172. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_mmlu.py +0 -0
  173. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_multi_wiki_qa.py +0 -0
  174. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_multinerd-it.py +0 -0
  175. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_no_cola.py +0 -0
  176. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_no_sammendrag.py +0 -0
  177. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_nor_common_sense_qa.py +0 -0
  178. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_nordjylland_news.py +0 -0
  179. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_norec.py +0 -0
  180. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_norglm_multiqa.py +0 -0
  181. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_norglm_multisum.py +0 -0
  182. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_norne.py +0 -0
  183. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_norquad.py +0 -0
  184. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_nqii.py +0 -0
  185. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_nrk_quiz_qa.py +0 -0
  186. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_orange_sum.py +0 -0
  187. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_personal_sum.py +0 -0
  188. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_publico.py +0 -0
  189. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_rrn.py +0 -0
  190. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_sb10k.py +0 -0
  191. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_scala.py +0 -0
  192. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_scandiqa.py +0 -0
  193. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_scandisent_fi.py +0 -0
  194. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_schibsted.py +0 -0
  195. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_sentiment_headlines_es.py +0 -0
  196. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_sentipolc16.py +0 -0
  197. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_squad.py +0 -0
  198. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_squad_it.py +0 -0
  199. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_squad_nl.py +0 -0
  200. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_squad_nl_old.py +0 -0
  201. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_sst2_pt.py +0 -0
  202. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_sst5.py +0 -0
  203. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_suc3.py +0 -0
  204. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_swedn.py +0 -0
  205. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_swerec.py +0 -0
  206. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_turku_ner_fi.py +0 -0
  207. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_tydiqa_fi.py +0 -0
  208. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_wiki_lingua_nl.py +0 -0
  209. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_wikiann_fo.py +0 -0
  210. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_wikineural-it.py +0 -0
  211. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_winogrande_is.py +0 -0
  212. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_xlsum_fi.py +0 -0
  213. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_xquad_es.py +0 -0
  214. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/fix_dot_env_file.py +0 -0
  215. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/load_ud_pos.py +0 -0
  216. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/versioning.py +0 -0
  217. {euroeval-15.15.0 → euroeval-15.16.0}/tests/__init__.py +0 -0
  218. {euroeval-15.15.0 → euroeval-15.16.0}/tests/conftest.py +0 -0
  219. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_config_factory.py +0 -0
  220. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/__init__.py +0 -0
  221. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/test_base.py +0 -0
  222. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/test_fresh.py +0 -0
  223. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/test_hf.py +0 -0
  224. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/test_litellm.py +0 -0
  225. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/test_vllm.py +0 -0
  226. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmarker.py +0 -0
  227. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_callbacks.py +0 -0
  228. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_cli.py +0 -0
  229. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_constants.py +0 -0
  230. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_data_loading.py +0 -0
  231. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_data_models.py +0 -0
  232. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_dataset_configs.py +0 -0
  233. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_enums.py +0 -0
  234. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_exceptions.py +0 -0
  235. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_finetuning.py +0 -0
  236. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_generation.py +0 -0
  237. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_human_evaluation.py +0 -0
  238. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_languages.py +0 -0
  239. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_model_cache.py +0 -0
  240. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_model_config.py +0 -0
  241. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_model_loading.py +0 -0
  242. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_scores.py +0 -0
  243. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_speed_benchmark.py +0 -0
  244. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_task_utils/__init__.py +0 -0
  245. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_task_utils/test_question_answering.py +0 -0
  246. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_task_utils/test_sequence_classification.py +0 -0
  247. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_task_utils/test_text_to_text.py +0 -0
  248. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_task_utils/test_token_classification.py +0 -0
  249. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_tasks.py +0 -0
  250. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_tokenization_utils.py +0 -0
  251. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_types.py +0 -0
  252. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_utils.py +0 -0
.github/ISSUE_TEMPLATE/bug.yaml
@@ -55,7 +55,7 @@ body:
     attributes:
       label: EuroEval version
       description: What version of EuroEval are you using?
-      placeholder: Output of `pip list | grep EuroEval`
+      placeholder: Output of `pip list | grep euroeval`
     validations:
       required: true
   - type: input
.github/workflows/ci.yaml
@@ -57,7 +57,7 @@ jobs:
         run: uv sync --no-dev --extra test

       - name: Start Ollama server
-        run: curl -fsSL https://ollama.com/install.sh | sh
+        run: curl -fsSL https://ollama.com/install.sh | sh && ollama serve &

       - name: Test with pytest
         run: uv run pytest
@@ -66,6 +66,8 @@ jobs:
           HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+          XAI_API_KEY: ${{ secrets.XAI_API_KEY }}

       - name: Delete EuroEval cache
         run: rm -rf .euroeval_cache
@@ -88,7 +90,7 @@ jobs:
         run: uv sync --no-dev --extra test

       - name: Start Ollama server
-        run: curl -fsSL https://ollama.com/install.sh | sh
+        run: curl -fsSL https://ollama.com/install.sh | sh && ollama serve &

       - name: Test with pytest
         run: uv run pytest
.pre-commit-config.yaml
@@ -4,13 +4,13 @@ repos:
     hooks:
       - id: python-use-type-annotations
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
+    rev: v6.0.0
    hooks:
      - id: end-of-file-fixer
      - id: trailing-whitespace
      - id: debug-statements
  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.7
+    rev: v0.12.8
    hooks:
      - id: ruff
        args:
CHANGELOG.md
@@ -10,6 +10,23 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.



+## [v15.16.0] - 2025-08-12
+### Added
+- Added metadata for GPT-5 models.
+
+### Changed
+- Updated `transformers` dependency to `>=4.55.0`.
+
+### Fixed
+- If the model uses 'mxfp4' quantisation then we allow the dtype to be bfloat16, rather
+  than forcing float16. This caused issues with the new GPT-OSS models.
+- Prevent multiple `Model <model-id> does not exist` logs when evaluating a model
+  that does not exist - now only logs this once.
+- Cleaner error message when attempting to benchmark a generative model without having a
+  GPU available.
+- Now raises error if an inference API is used with a parameter that is not supported.
+
+
 ## [v15.15.0] - 2025-08-06
 ### Added
 - Added the common-sense reasoning dataset GoldenSwag for the following
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.15.0
+Version: 15.16.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -56,7 +56,7 @@ Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
 Requires-Dist: torch>=2.6.0
-Requires-Dist: transformers>=4.51.0
+Requires-Dist: transformers>=4.55.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
@@ -233,6 +233,7 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
+<a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>


 ### Contribute to EuroEval
README.md
@@ -159,6 +159,7 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
+<a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>


 ### Contribute to EuroEval
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "EuroEval"
-version = "15.15.0"
+version = "15.16.0"
 description = "The robust European language model benchmark."
 readme = "README.md"
 authors = [
@@ -14,7 +14,7 @@ dependencies = [
     "torch>=2.6.0",
     "pandas>=2.2.0",
     "numpy>=1.23.0,<2.0.0",
-    "transformers>=4.51.0",
+    "transformers>=4.55.0",
     "accelerate>=1.9.0",
     "evaluate>=0.4.1",
     "datasets>=3.5.0",
src/euroeval/benchmark_modules/litellm.py
@@ -6,7 +6,7 @@ import logging
 import os
 import re
 import typing as t
-from functools import cached_property, partial
+from functools import cache, cached_property, partial
 from time import sleep

 import litellm
@@ -27,6 +27,7 @@ from litellm.exceptions import (
     RateLimitError,
     ServiceUnavailableError,
     Timeout,
+    UnsupportedParamsError,
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
@@ -87,6 +88,7 @@ logger = logging.getLogger("euroeval")

 VOCAB_SIZE_MAPPING = {
     # OpenAI models
+    r"gpt-5-.*": 100_256,
     r"gpt-4-(32k)?(-[0-9]{4})?": 100_256,
     r"gpt-4-[0-9]{4}-preview": 100_256,
     r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
@@ -105,6 +107,7 @@ VOCAB_SIZE_MAPPING = {

 MODEL_MAX_LENGTH_MAPPING = {
     # OpenAI models
+    r"gpt-5-.*": 272_000,
     r"gpt-4(-[0-9]{4})?": 8_191,
     r"gpt-4-32k(-[0-9]{4})?": 32_767,
     r"gpt-4-[0-9]{4}-preview": 128_000,
@@ -129,6 +132,7 @@ MODEL_MAX_LENGTH_MAPPING = {

 NUM_PARAMS_MAPPING = {
     # OpenAI models
+    r"gpt-5-.*": -1,
     r"gpt-4.*": -1,
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
     # Anthropic models
@@ -144,6 +148,7 @@ NUM_PARAMS_MAPPING = {

 ALLOWED_PARAMS = {
     # OpenAI models
+    r"gpt-5-.*": ["minimal", "low", "medium", "high"],
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "medium", "high"],
     # Anthropic models
     r"(anthropic/)?claude-3-7-sonnet.*": ["no-thinking", "thinking"],
src/euroeval/benchmark_modules/litellm.py (continued)
@@ -269,28 +274,9 @@ class LiteLLMModel(BenchmarkModule):
             generative_type=self.generative_type,
         )

-        # Set the core generation arguments
-        generation_kwargs: dict[str, t.Any] = dict(
-            model=self.model_config.model_id,
-            max_completion_tokens=(
-                REASONING_MAX_TOKENS
-                if self.generative_type == GenerativeType.REASONING
-                else self.dataset_config.max_generated_tokens
-            ),
-            stop=[],
-            temperature=0.0,
-            seed=4242,
-            api_key=self.benchmark_config.api_key,
-            api_base=self.benchmark_config.api_base,
-            api_version=self.benchmark_config.api_version,
-            max_retries=3,
-        )
-
-        # Set up the `response_format` generation argument if we are dealing with a task
-        # using structured generation
+        # Sanity check that "JSON" is included in the prompt, as some models require
+        # this
         if self.dataset_config.task in TASKS_USING_JSON:
-            # Sanity check that "JSON" is included in the prompt, as some models require
-            # this
             for conversation in conversations:
                 if not conversation:
                     raise InvalidBenchmark(
@@ -310,87 +296,6 @@ class LiteLLMModel(BenchmarkModule):
                         "Prompt must contain 'json' for JSON tasks."
                     )

-            if self.generative_type == GenerativeType.REASONING:
-                log_once(
-                    f"The model {self.model_config.model_id!r} is a reasoning model "
-                    "and thus does not support structured generation, so we do not "
-                    "enable it.",
-                    level=logging.DEBUG,
-                )
-            elif supports_response_schema(model=self.model_config.model_id):
-                ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-                keys_and_their_types: dict[str, t.Any] = {
-                    tag_name: (conlist(str, max_length=5), ...)
-                    for tag_name in ner_tag_names
-                }
-                pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
-                generation_kwargs["response_format"] = pydantic_class
-                log_once(
-                    "Enabling structured generation for model "
-                    f"{self.model_config.model_id!r} with the JSON schema "
-                    f"{pydantic_class.model_json_schema()}",
-                    level=logging.DEBUG,
-                )
-            else:
-                generation_kwargs["response_format"] = dict(type="json_object")
-                log_once(
-                    "Enabling structured JSON generation for model "
-                    f"{self.model_config.model_id!r} with no custom JSON schema, as "
-                    "the model does not support schemas.",
-                    level=logging.DEBUG,
-                )
-
-        # If the model is an Ollama reasoning model, we ensure that thinking is enabled
-        if self.is_ollama and self.generative_type == GenerativeType.REASONING:
-            generation_kwargs["think"] = True
-            log_once(
-                "Enabling thinking mode for Ollama model "
-                f"{self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-
-        # Handle manually set parameters
-        if self.buffer["first_label_token_mapping"]:
-            generation_kwargs["logprobs"] = True
-            generation_kwargs["top_logprobs"] = MAX_LOGPROBS
-        if self.model_config.revision == "thinking":
-            generation_kwargs["thinking"] = dict(
-                type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
-            )
-            log_once(
-                f"Enabling thinking mode for model {self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-        elif self.model_config.revision == "no-thinking":
-            generation_kwargs["thinking"] = dict(budget_tokens=0)
-            log_once(
-                f"Disabling thinking mode for model {self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-        elif self.model_config.revision in {"low", "medium", "high"}:
-            generation_kwargs["reasoning_effort"] = self.model_config.revision
-            log_once(
-                f"Enabling reasoning effort {self.model_config.revision!r} for model "
-                f"{self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-
-        # Drop generation kwargs that are not supported by the model
-        litellm.drop_params = True
-
-        # First attempt is a test run with a single conversation to handle errors
-        # quickly
-        test_conversation = conversations[0]
-        _, failures = safe_run(
-            self._generate_async(
-                model_id=self.model_config.model_id,
-                conversations=[test_conversation],
-                **generation_kwargs,
-            )
-        )
-        for _, error in failures:
-            self._handle_exception(error=error, generation_kwargs=generation_kwargs)
-
         all_responses: dict[int, "ModelResponse"] = {}
         conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
             enumerate(conversations)
@@ -404,7 +309,7 @@ class LiteLLMModel(BenchmarkModule):
                 self._generate_async(
                     model_id=self.model_config.model_id,
                     conversations=list(batch_conversations),
-                    **generation_kwargs,
+                    **self.get_generation_kwargs(dataset_config=self.dataset_config),
                 )
             )

@@ -431,7 +336,12 @@
             # Attempt to handle the exceptions, to improve the chance of getting
             # successful generations next time around
             for _, error in failures:
-                self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+                self._handle_exception(
+                    error=error,
+                    generation_kwargs=self.get_generation_kwargs(
+                        dataset_config=self.dataset_config
+                    ),
+                )

             # Sleep for a second to avoid pinging the API server too quickly
             sleep(1)
@@ -484,6 +394,7 @@
             "`temperature` may only be set to 1",
             "'temperature' does not support 0.0 with this model. Only the default "
             "(1) value is supported",
+            "Only temperature=1 is supported",
         ]
         max_items_messages = ["'maxItems' is not permitted."]
         no_json_schema_messages = ["Property keys should match pattern"]
@@ -593,6 +504,20 @@
             )
             sleep(5)
             return
+        elif isinstance(error, UnsupportedParamsError):
+            unsupported_param_match = re.search(
+                pattern=r"(?<=does not support parameters\: \[')([^ ']+)(?='\])",
+                string=error.message,
+            )
+            if unsupported_param_match is None:
+                raise InvalidModel(error.message)
+            else:
+                unsupported_param = unsupported_param_match.group(0)
+                raise InvalidModel(
+                    f"The model {model_id!r} does not support the parameter "
+                    f"{unsupported_param!r}. Try again without this parameter. "
+                    "Skipping this model."
+                )
         elif isinstance(error, (APIConnectionError, OSError)):
             # If there are too many I/O connections, we increase the number of allowed
             # file descriptors
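The new UnsupportedParamsError branch above pulls the offending parameter name out of LiteLLM's error message with a lookbehind/lookahead pattern. A minimal sketch of how that extraction behaves; the sample message below is only shaped like the one the pattern expects, not LiteLLM's exact wording. The litellm.py diff continues below.

import re

# Illustrative error text (an assumption about the message shape, not LiteLLM's exact string).
message = (
    "openai does not support parameters: ['logprobs'], for model=gpt-5-mini. "
    "To drop these, set `litellm.drop_params=True`."
)

# Same pattern as in the diff: anchored between "does not support parameters: ['" and "']".
pattern = r"(?<=does not support parameters\: \[')([^ ']+)(?='\])"

match = re.search(pattern=pattern, string=message)
print(match.group(0) if match else None)  # logprobs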
src/euroeval/benchmark_modules/litellm.py (continued)
@@ -1233,6 +1158,126 @@ class LiteLLMModel(BenchmarkModule):

         return dataset

+    @cache
+    def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
+        """Get the generation arguments for the model.
+
+        Args:
+            dataset_config:
+                The dataset configuration, which is used to determine the generative
+                type of the model. We use this as an argument here rather than using
+                `self.dataset_config` to ensure that that the cache is updated when the
+                dataset configuration changes.
+
+        Returns:
+            The generation arguments for the model.
+        """
+        # Set the core generation arguments
+        generation_kwargs: dict[str, t.Any] = dict(
+            model=self.model_config.model_id,
+            max_completion_tokens=(
+                REASONING_MAX_TOKENS
+                if self.generative_type == GenerativeType.REASONING
+                else dataset_config.max_generated_tokens
+            ),
+            stop=[],
+            temperature=0.0,
+            seed=4242,
+            api_key=self.benchmark_config.api_key,
+            api_base=self.benchmark_config.api_base,
+            api_version=self.benchmark_config.api_version,
+            max_retries=3,
+        )
+
+        # Set up the `response_format` generation argument if we are dealing with a task
+        # using structured generation
+        if dataset_config.task in TASKS_USING_JSON:
+            if self.generative_type == GenerativeType.REASONING:
+                log_once(
+                    f"The model {self.model_config.model_id!r} is a reasoning model "
+                    "and thus does not support structured generation, so we do not "
+                    "enable it.",
+                    level=logging.DEBUG,
+                )
+            elif supports_response_schema(model=self.model_config.model_id):
+                ner_tag_names = list(dataset_config.prompt_label_mapping.values())
+                keys_and_their_types: dict[str, t.Any] = {
+                    tag_name: (conlist(str, max_length=5), ...)
+                    for tag_name in ner_tag_names
+                }
+                pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+                generation_kwargs["response_format"] = pydantic_class
+                log_once(
+                    "Enabling structured generation for model "
+                    f"{self.model_config.model_id!r} with the JSON schema "
+                    f"{pydantic_class.model_json_schema()}",
+                    level=logging.DEBUG,
+                )
+            else:
+                generation_kwargs["response_format"] = dict(type="json_object")
+                log_once(
+                    "Enabling structured JSON generation for model "
+                    f"{self.model_config.model_id!r} with no custom JSON schema, as "
+                    "the model does not support schemas.",
+                    level=logging.DEBUG,
+                )
+
+        # If the model is an Ollama reasoning model, we ensure that thinking is enabled
+        if self.is_ollama and self.generative_type == GenerativeType.REASONING:
+            generation_kwargs["think"] = True
+            log_once(
+                "Enabling thinking mode for Ollama model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+
+        # Handle manually set parameters
+        if self.buffer["first_label_token_mapping"]:
+            generation_kwargs["logprobs"] = True
+            generation_kwargs["top_logprobs"] = MAX_LOGPROBS
+        if self.model_config.revision == "thinking":
+            generation_kwargs["thinking"] = dict(
+                type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
+            )
+            log_once(
+                f"Enabling thinking mode for model {self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+        elif self.model_config.revision == "no-thinking":
+            generation_kwargs["thinking"] = dict(budget_tokens=0)
+            log_once(
+                f"Disabling thinking mode for model {self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+        elif self.model_config.revision in {"minimal", "low", "medium", "high"}:
+            generation_kwargs["reasoning_effort"] = self.model_config.revision
+            log_once(
+                f"Enabling reasoning effort {self.model_config.revision!r} for model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+
+        # First attempt is a test run with a single conversation to handle errors
+        # quickly. We repeat this multiple times to deal with different types of
+        # errors, and stop if we get a successful response.
+        test_conversation = [
+            litellm.ChatCompletionUserMessage(role="user", content="Test message")
+        ]
+        for _ in range(5):
+            _, failures = safe_run(
+                self._generate_async(
+                    model_id=self.model_config.model_id,
+                    conversations=[test_conversation],
+                    **generation_kwargs,
+                )
+            )
+            if not failures:
+                break
+            for _, error in failures:
+                self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+
+        return generation_kwargs
+

 def raise_if_wrong_params(
     model_config: ModelConfig, allowed_params: dict[str, list[str]]
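Worth noting about the new get_generation_kwargs method: functools.cache memoises per argument tuple, so passing the dataset config explicitly (instead of reading self.dataset_config) yields one cache entry per dataset configuration, and the one-off test request only runs once per dataset. A minimal sketch of that caching behaviour with stand-in classes (Config and Model below are illustrative, not EuroEval's):

from dataclasses import dataclass
from functools import cache


@dataclass(frozen=True)  # frozen dataclasses are hashable, so instances can be cache keys
class Config:
    name: str
    max_tokens: int


class Model:
    @cache
    def get_kwargs(self, config: Config) -> dict:
        print(f"building kwargs for {config.name}")  # runs once per (self, config) pair
        return {"max_completion_tokens": config.max_tokens}


model = Model()
cfg = Config(name="sentiment", max_tokens=32)
model.get_kwargs(cfg)                                # prints: building kwargs for sentiment
model.get_kwargs(cfg)                                # cached: no print
model.get_kwargs(Config(name="ner", max_tokens=64))  # different config: rebuilds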
src/euroeval/benchmark_modules/litellm.py (continued)
@@ -1264,6 +1309,11 @@ def raise_if_wrong_params(
                 msg += " No parameters are allowed."
                 raise InvalidModel(msg)
             return
+    else:
+        raise InvalidModel(
+            f"The parameter {param!r} is not supported for the model "
+            f"{model_config.model_id!r}."
+        )


 def try_download_ollama_model(model_id: str) -> bool:
src/euroeval/benchmark_modules/vllm.py
@@ -168,7 +168,8 @@ class VLLMModel(HuggingFaceEncoderModel):

     def __del__(self) -> None:
         """Clean up the model and tokenizer."""
-        clear_vllm()
+        if importlib.util.find_spec("vllm") is not None:
+            clear_vllm()
         if hasattr(self, "_model"):
             del self._model
         if hasattr(self, "_tokenizer"):
@@ -690,8 +691,14 @@ def load_model_and_tokenizer(
         )
         dtype = torch.float16

-    # If the model is a quantized model, we need to set the dtype to float16
-    if quantization is not None and hf_model_config.torch_dtype != torch.float16:
+    # If the model is a quantized model, we might need to change the dtype
+    if quantization == "mxfp4" and hf_model_config.torch_dtype is None:
+        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+        logger.debug(
+            "You are loading a quantized model where `torch_dtype` has not been set. "
+            f"Setting dtype to {dtype!r}."
+        )
+    elif quantization is not None and hf_model_config.torch_dtype != torch.float16:
         logger.info(
             "You are loading a quantized model with dtype "
             f"{hf_model_config.torch_dtype}, which vLLM does not support. Setting "
src/euroeval/benchmarker.py
@@ -379,7 +379,16 @@

         current_benchmark_results: list[BenchmarkResult] = list()
         for model_id in model_ids:
-            model_config: ModelConfig | None = None
+            # Load the model configuration, or skip the model if it is invalid
+            try:
+                model_config = get_model_config(
+                    model_id=model_id, benchmark_config=benchmark_config
+                )
+            except InvalidModel as e:
+                logger.info(e.message)
+                num_finished_benchmarks += len(dataset_configs)
+                continue
+
             loaded_model: BenchmarkModule | None = None
             for dataset_config in dataset_configs:
                 # Skip if we have already benchmarked this model on this dataset and
@@ -399,16 +408,6 @@
                     num_finished_benchmarks += 1
                     continue

-                if model_config is None:
-                    try:
-                        model_config = get_model_config(
-                            model_id=model_id, benchmark_config=benchmark_config
-                        )
-                    except InvalidModel as e:
-                        logger.info(e.message)
-                        num_finished_benchmarks += len(dataset_configs)
-                        continue
-
                 # Skip if the model is an encoder model and the task is generative
                 task_is_generative = (
                     dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
src/euroeval/finetuning.py
@@ -3,6 +3,7 @@
 import logging
 import sys
 import typing as t
+from functools import partial

 import torch
 from tqdm.auto import tqdm
@@ -198,7 +199,7 @@ def finetune_single_iteration(
         args=training_args,
         train_dataset=dataset["train"],
         eval_dataset=dataset["val"],
-        compute_metrics=model.compute_metrics,
+        compute_metrics=partial(model.compute_metrics, dataset=None),
         callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
         data_collator=model.data_collator,
         preprocess_logits_for_metrics=remove_extra_tensors_from_logits,
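The partial(model.compute_metrics, dataset=None) change works together with the "Dataset | None" signatures in the metrics.py diff below: the Hugging Face Trainer calls compute_metrics with a single predictions argument, so the extra dataset parameter is pre-bound to None. A minimal sketch of the pattern, using an illustrative metric function rather than EuroEval's:

from functools import partial

def compute_metrics(eval_pred: tuple, dataset: object | None = None) -> dict:
    """Illustrative metric function that can optionally look at the raw dataset."""
    predictions, references = eval_pred
    accuracy = sum(p == r for p, r in zip(predictions, references)) / len(references)
    return {"accuracy": accuracy}

# Pre-bind dataset=None so the resulting callable takes only the predictions argument,
# matching the single-argument interface the trainer expects.
metrics_fn = partial(compute_metrics, dataset=None)
print(metrics_fn(([1, 0, 1], [1, 1, 1])))  # {'accuracy': 0.666...}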
src/euroeval/metrics.py
@@ -51,7 +51,7 @@ class Metric(abc.ABC):

     @abc.abstractmethod
     def __call__(
-        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
+        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
     ) -> float | None:
         """Calculate the metric score.

@@ -132,7 +132,7 @@ class HuggingFaceMetric(Metric):
         self.metric: "EvaluationModule | None" = None

     def __call__(
-        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
+        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
     ) -> float | None:
         """Calculate the metric score.

@@ -225,7 +225,7 @@ class LLMAsAJudgeMetric(Metric):
         self.system_prompt = system_prompt

     def __call__(
-        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
+        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
     ) -> float | None:
         """Calculate the metric score using the judge model.

@@ -359,7 +359,9 @@ class SpeedMetric(Metric):
             postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
         )

-    def __call__(self, _: t.Sequence, __: t.Sequence, ___: "Dataset") -> float | None:
+    def __call__(
+        self, _: t.Sequence, __: t.Sequence, ___: "Dataset | None"
+    ) -> float | None:
         """Not used with the speed metric, but required for consistency."""
         raise NotImplementedError

uv.lock
@@ -1123,7 +1123,7 @@ wheels = [

 [[package]]
 name = "euroeval"
-version = "15.15.0"
+version = "15.16.0"
 source = { editable = "." }
 dependencies = [
     { name = "accelerate" },
@@ -1246,7 +1246,7 @@ requires-dist = [
     { name = "tenacity", specifier = ">=9.0.0" },
     { name = "termcolor", specifier = ">=2.0.0" },
     { name = "torch", specifier = ">=2.6.0" },
-    { name = "transformers", specifier = ">=4.51.0" },
+    { name = "transformers", specifier = ">=4.55.0" },
     { name = "vllm", marker = "sys_platform == 'linux' and extra == 'all'", specifier = ">=0.10.0" },
     { name = "vllm", marker = "sys_platform == 'linux' and extra == 'generative'", specifier = ">=0.10.0" },
 ]
@@ -5376,7 +5376,7 @@ wheels = [

 [[package]]
 name = "transformers"
-version = "4.54.1"
+version = "4.55.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
@@ -5390,9 +5390,9 @@ dependencies = [
     { name = "tokenizers" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/21/6c/4caeb57926f91d943f309b062e22ad1eb24a9f530421c5a65c1d89378a7a/transformers-4.54.1.tar.gz", hash = "sha256:b2551bb97903f13bd90c9467d0a144d41ca4d142defc044a99502bb77c5c1052", size = 9514288, upload-time = "2025-07-29T15:57:22.826Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/27/5d/f7dc746eef83336a6b34197311fe0c1da0d1192f637c726c6a5cf0d83502/transformers-4.55.0.tar.gz", hash = "sha256:15aa138a05d07a15b30d191ea2c45e23061ebf9fcc928a1318e03fe2234f3ae1", size = 9569089, upload-time = "2025-08-05T16:13:48.997Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/cf/18/eb7578f84ef5a080d4e5ca9bc4f7c68e7aa9c1e464f1b3d3001e4c642fce/transformers-4.54.1-py3-none-any.whl", hash = "sha256:c89965a4f62a0d07009d45927a9c6372848a02ab9ead9c318c3d082708bab529", size = 11176397, upload-time = "2025-07-29T15:57:19.692Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/93/bcb22fb52ed65084c0199270832aa4cdd4b41296d896f3e7ade188bccb68/transformers-4.55.0-py3-none-any.whl", hash = "sha256:29d9b8800e32a4a831bb16efb5f762f6a9742fef9fce5d693ed018d19b106490", size = 11267905, upload-time = "2025-08-05T16:13:34.814Z" },
 ]

 [[package]]