EuroEval 15.4.2__py3-none-any.whl → 15.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


euroeval/__init__.py CHANGED
@@ -4,6 +4,7 @@
4
4
  ### Block unwanted terminal output that happens on importing external modules ###
5
5
 
6
6
  import logging
7
+ import os
7
8
  import sys
8
9
  import warnings
9
10
 
@@ -14,7 +15,7 @@ warnings.filterwarnings("ignore", category=UserWarning)
14
15
  logging.getLogger("httpx").setLevel(logging.CRITICAL)
15
16
  logging.getLogger("datasets").setLevel(logging.CRITICAL)
16
17
  logging.getLogger("vllm").setLevel(logging.CRITICAL)
17
- logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
18
+ os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
18
19
 
19
20
  # Set up logging
20
21
  fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
@@ -29,7 +30,6 @@ logging.basicConfig(
29
30
  ### Set the rest up ###
30
31
 
31
32
  import importlib.metadata # noqa: E402
32
- import os # noqa: E402
33
33
 
34
34
  from dotenv import load_dotenv # noqa: E402
35
35
 
euroeval/benchmark_modules/hf.py CHANGED
@@ -44,6 +44,7 @@ from ..constants import (
44
44
  DUMMY_FILL_VALUE,
45
45
  GENERATIVE_PIPELINE_TAGS,
46
46
  LOCAL_MODELS_REQUIRED_FILES,
47
+ MAX_CONTEXT_LENGTH,
47
48
  MERGE_TAGS,
48
49
  )
49
50
  from ..data_models import BenchmarkConfig, DatasetConfig, HFModelInfo, ModelConfig, Task
@@ -245,6 +246,15 @@ class HuggingFaceEncoderModel(BenchmarkModule):
245
246
  max_length for max_length in all_max_lengths if max_length >= 128
246
247
  ]
247
248
 
249
+ # We remove the upper cap of maximum context length for the model, as it is
250
+ # highly unlikely that this is the model's actual maximum context length - we
251
+ # would rather not report a value than report an incorrect one.
252
+ all_max_lengths = [
253
+ max_length
254
+ for max_length in all_max_lengths
255
+ if max_length != MAX_CONTEXT_LENGTH
256
+ ]
257
+
248
258
  if len(list(all_max_lengths)) > 0:
249
259
  model_max_length = min(list(all_max_lengths))
250
260
  else:
@@ -1140,8 +1150,7 @@ def align_model_and_tokenizer(
1140
1150
  Returns:
1141
1151
  The fixed model and tokenizer.
1142
1152
  """
1143
- # Ensure that the model max length is at most 5,000, to avoid OOM errors
1144
- model_max_length = min(model_max_length, 5_000)
1153
+ model_max_length = min(model_max_length, MAX_CONTEXT_LENGTH)
1145
1154
 
1146
1155
  if model_max_length > 0:
1147
1156
  tokenizer.model_max_length = model_max_length
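
For context on the two MAX_CONTEXT_LENGTH changes above: candidate lengths equal to the benchmark cap are now discarded before the minimum is reported, while align_model_and_tokenizer still clamps the working length to the cap. A minimal sketch of that selection logic (the sample values are made up):

```python
MAX_CONTEXT_LENGTH = 5_000  # benchmark-wide cap, mirroring euroeval/constants.py

def report_model_max_length(all_max_lengths: list[int]) -> int:
    """Pick the context length to report, mirroring the hunks above."""
    # Keep only plausible values, as in the surrounding hf.py code
    candidates = [length for length in all_max_lengths if length >= 128]
    # Drop values equal to the benchmark cap: they are almost certainly the cap
    # leaking in from a tokenizer setting, not the model's real maximum length
    candidates = [length for length in candidates if length != MAX_CONTEXT_LENGTH]
    return min(candidates) if candidates else -1  # -1 meaning "unknown" (assumption)

# Made-up values: 5_000 is discarded and 8_192 is reported, but evaluation still
# runs with min(8_192, MAX_CONTEXT_LENGTH) == 5_000 tokens.
print(report_model_max_length([5_000, 8_192, 131_072]))  # -> 8192
```
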
euroeval/benchmark_modules/litellm.py CHANGED
@@ -27,20 +27,17 @@ from litellm.exceptions import (
27
27
  BadRequestError,
28
28
  InternalServerError,
29
29
  NotFoundError,
30
+ RateLimitError,
30
31
  ServiceUnavailableError,
31
32
  Timeout,
32
33
  )
34
+ from litellm.llms.vertex_ai.common_utils import VertexAIError
33
35
  from litellm.types.utils import ModelResponse
34
36
  from requests.exceptions import RequestException
35
37
  from tqdm.auto import tqdm
36
38
  from transformers import Trainer
37
39
 
38
- from ..constants import (
39
- MAX_LOGPROBS,
40
- REASONING_MAX_TOKENS,
41
- TASK_GROUPS_USING_LOGPROBS,
42
- TASKS_USING_JSON,
43
- )
40
+ from ..constants import MAX_LOGPROBS, REASONING_MAX_TOKENS, TASKS_USING_JSON
44
41
  from ..data_models import (
45
42
  BenchmarkConfig,
46
43
  DatasetConfig,
@@ -69,7 +66,7 @@ from ..task_utils import (
69
66
  token_classification,
70
67
  )
71
68
  from ..types import ExtractLabelsFunction
72
- from ..utils import create_model_cache_dir, log_once
69
+ from ..utils import create_model_cache_dir, get_first_label_token_mapping, log_once
73
70
  from .base import BenchmarkModule
74
71
  from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer
75
72
 
@@ -78,64 +75,80 @@ logger = logging.getLogger("euroeval")
78
75
 
79
76
  VOCAB_SIZE_MAPPING = {
80
77
  # OpenAI models
81
- "(text-)?(ada|babbage|curie|davinci)(-001)?": 50_257,
82
- "(code|text)-davinci-00[2-9]": 50_281,
83
- "gpt-3.5-turbo(-16k)?(-[0-9]{4})?": 100_256,
84
- "gpt-4-(32k)?(-[0-9]{4})?": 100_256,
85
- "gpt-4-[0-9]{4}-preview": 100_256,
86
- "gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
87
- "gpt-4-(vision|turbo)(-preview)?": 100_256,
88
- "gpt-3.5-turbo-instruct(-[0-9]{4})?": 100_256,
89
- "gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_019,
90
- "o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
78
+ r"gpt-4-(32k)?(-[0-9]{4})?": 100_256,
79
+ r"gpt-4-[0-9]{4}-preview": 100_256,
80
+ r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
81
+ r"gpt-4-(vision|turbo)(-preview)?": 100_256,
82
+ r"gpt-3.5-turbo-instruct(-[0-9]{4})?": 100_256,
83
+ r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_019,
84
+ r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
91
85
  # Anthropic models
92
- "claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
86
+ r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
87
+ # Gemini models
88
+ r"(gemini/)?gemini-[1-9]\.[0-9]-(flash|pro).*": 256_128,
89
+ # xAI models
90
+ r"(xai/)?grok.*": -1,
93
91
  }
94
92
 
95
93
 
96
94
  MODEL_MAX_LENGTH_MAPPING = {
97
95
  # OpenAI models
98
- "(text-)?(ada|babbage|curie|davinci)(-001)?": 2_050,
99
- "text-davinci-00[2-9]": 4_098,
100
- "code-davinci-00[1-9]": 8_002,
101
- "gpt-3.5-turbo-0613": 4_096,
102
- "gpt-3.5-turbo(-[0-9]{4})?": 16_385,
103
- "gpt-3.5-turbo-16k(-[0-9]{4})?": 16_384,
104
- "gpt-4(-[0-9]{4})?": 8_191,
105
- "gpt-4-32k(-[0-9]{4})?": 32_767,
106
- "gpt-4-[0-9]{4}-preview": 128_000,
107
- "gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
108
- "gpt-4-(vision|turbo)(-preview)?": 128_000,
109
- "gpt-3.5-turbo-instruct(-[0-9]{4})?": 4_095,
110
- "gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
111
- "o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
112
- "o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
113
- "o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
96
+ r"gpt-4(-[0-9]{4})?": 8_191,
97
+ r"gpt-4-32k(-[0-9]{4})?": 32_767,
98
+ r"gpt-4-[0-9]{4}-preview": 128_000,
99
+ r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
100
+ r"gpt-4-(vision|turbo)(-preview)?": 128_000,
101
+ r"gpt-3.5-turbo-instruct(-[0-9]{4})?": 4_095,
102
+ r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
103
+ r"o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
104
+ r"o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
105
+ r"o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
114
106
  # Anthropic models
115
- "claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
107
+ r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
108
+ # Gemini models
109
+ r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
110
+ r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
111
+ r"(gemini/)?gemini-2\.(0|5).*": 1_048_576,
112
+ # xAI models
113
+ r"(xai/)?grok.*": 131_072,
116
114
  }
117
115
 
118
116
 
119
117
  NUM_PARAMS_MAPPING = {
120
118
  # OpenAI models
121
- "(text-)?ada(-001)?": 350_000_000,
122
- "(text-)?babbage(-001)?": 3_000_000_000,
123
- "(text-)?curie(-001)?": 13_000_000_000,
124
- "((text|code)-)?davinci(-00[1-9])?": 175_000_000_000,
125
- "gpt-(3.5|4)-turbo-((16|32)k)?(-[0-9]{4})?": -1,
126
- "gpt-4-[0-9]{4}-preview": -1,
127
- "gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
128
- "gpt-4-(vision|turbo)(-preview)?": -1,
129
- "gpt-3.5-turbo-instruct(-[0-9]{4})?": -1,
130
- "gpt-4o(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
131
- "gpt-4o-mini(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
132
- "o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
119
+ r"gpt-4.*": -1,
120
+ r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
133
121
  # Anthropic models
134
- "claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
122
+ r"(anthropic/)?claude-*": -1,
123
+ # Gemini models
124
+ r"(gemini/)?gemini-1.5-flash-8b": 8_000_000_000,
125
+ r"(gemini/)?gemini-1.5-flash-[0-9]+": -1,
126
+ r"(gemini/)?gemini-2.(0|5).*": -1,
127
+ # xAI models
128
+ r"(xai/)?grok.*": -1,
135
129
  }
136
130
 
137
131
 
138
- REASONING_MODELS = ["o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?"]
132
+ ALLOWED_PARAMS = {
133
+ # OpenAI models
134
+ r"gpt-4.*": [],
135
+ r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "high"],
136
+ # Anthropic models
137
+ r"(anthropic/)?claude-3-.*": [],
138
+ r"(anthropic/)?claude-3.5-.*": [],
139
+ r"(anthropic/)?claude-3.7-sonnet.*": ["thinking"],
140
+ # Gemini models
141
+ r"(gemini/)?gemini-.*": [],
142
+ # xAI models
143
+ r"(xai/)?grok.*": [],
144
+ }
145
+
146
+
147
+ REASONING_MODELS = [
148
+ r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?",
149
+ r"(gemini/)?gemini.*thinking.*",
150
+ r"(gemini/)?gemini-2.5-pro.*",
151
+ ]
139
152
 
140
153
 
141
154
  class LiteLLMModel(BenchmarkModule):
@@ -167,12 +180,18 @@ class LiteLLMModel(BenchmarkModule):
167
180
  "ollama/"
168
181
  ) or model_config.model_id.startswith("ollama_chat/")
169
182
 
183
+ raise_if_wrong_params(model_config=model_config, allowed_params=ALLOWED_PARAMS)
184
+
170
185
  super().__init__(
171
186
  model_config=model_config,
172
187
  dataset_config=dataset_config,
173
188
  benchmark_config=benchmark_config,
174
189
  )
175
190
 
191
+ self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
192
+ dataset_config=self.dataset_config, tokenizer=None
193
+ )
194
+
176
195
  @property
177
196
  def generative_type(self) -> GenerativeType | None:
178
197
  """Get the generative type of the model.
@@ -180,7 +199,9 @@ class LiteLLMModel(BenchmarkModule):
180
199
  Returns:
181
200
  The generative type of the model, or None if it has not been set yet.
182
201
  """
183
- if re.fullmatch(
202
+ if self.model_config.revision == "thinking":
203
+ return GenerativeType.REASONING
204
+ elif re.fullmatch(
184
205
  pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
185
206
  ):
186
207
  return GenerativeType.REASONING
@@ -218,7 +239,13 @@ class LiteLLMModel(BenchmarkModule):
218
239
  api_version=self.benchmark_config.api_version,
219
240
  )
220
241
 
221
- if self.dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS:
242
+ # Get the mapping from labels to the first token in the label. We call this each
243
+ # time we generate a new dataset since the dataset config can change
244
+ self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
245
+ dataset_config=self.dataset_config, tokenizer=None
246
+ )
247
+
248
+ if self.buffer["first_label_token_mapping"]:
222
249
  generation_kwargs["logprobs"] = True
223
250
  generation_kwargs["top_logprobs"] = MAX_LOGPROBS
224
251
 
@@ -227,6 +254,27 @@ class LiteLLMModel(BenchmarkModule):
227
254
  "Prompt must contain 'json' for JSON tasks."
228
255
  )
229
256
  generation_kwargs["response_format"] = dict(type="json_object")
257
+ log_once(
258
+ "Enabling JSON response format for model "
259
+ f"{self.model_config.model_id!r}",
260
+ level=logging.DEBUG,
261
+ )
262
+
263
+ if self.model_config.revision == "thinking":
264
+ generation_kwargs["thinking"] = dict(
265
+ type="enabled", budget_tokens=REASONING_MAX_TOKENS
266
+ )
267
+ log_once(
268
+ f"Enabling thinking mode for model {self.model_config.model_id!r}",
269
+ level=logging.DEBUG,
270
+ )
271
+ elif self.model_config.revision in {"low", "high"}:
272
+ generation_kwargs["reasoning_effort"] = self.model_config.revision
273
+ log_once(
274
+ f"Enabling reasoning effort {self.model_config.revision!r} for model "
275
+ f"{self.model_config.model_id!r}",
276
+ level=logging.DEBUG,
277
+ )
230
278
 
231
279
  # This drops generation kwargs that are not supported by the model
232
280
  litellm.drop_params = True
@@ -235,39 +283,60 @@ class LiteLLMModel(BenchmarkModule):
235
283
  # handle using newlines as stop sequences, so we try both.
236
284
  num_attempts = 10
237
285
  for _ in range(num_attempts):
286
+ stop_messages = ["stop_sequences"]
287
+ logprobs_messages = [
288
+ "you are not allowed to request logprobs",
289
+ "you've reached the maximum number of requests with logprobs",
290
+ "logprobs is not supported",
291
+ "logprobs is not enabled",
292
+ ]
293
+ temperature_messages = [
294
+ "'temperature' is not supported with this model.",
295
+ "temperature is not supported with this model",
296
+ ]
238
297
  try:
239
298
  model_response = litellm.completion(
240
299
  messages=messages, max_retries=3, **generation_kwargs
241
300
  )
242
301
  break
243
- except BadRequestError as e:
244
- if "stop_sequences" in str(e).lower():
302
+ except (BadRequestError, RateLimitError) as e:
303
+ if any(msg.lower() in str(e).lower() for msg in stop_messages):
245
304
  generation_kwargs["stop"] = None
246
- elif "you are not allowed to request logprobs" in str(e).lower():
247
- generation_kwargs.pop("logprobs")
248
- generation_kwargs.pop("top_logprobs")
249
305
  elif (
250
- "'temperature' is not supported with this model." in str(e).lower()
306
+ any(msg.lower() in str(e).lower() for msg in logprobs_messages)
307
+ # Special case for Vertex AI models, since they have strict rate
308
+ # limits on using logprobs. They also have a cap of 5 logprobs, but
309
+ # we ignore this since the rate limiting makes it unusable anyway.
310
+ or (isinstance(e, VertexAIError) and "logprobs" in str(e).lower())
251
311
  ):
312
+ generation_kwargs.pop("logprobs")
313
+ generation_kwargs.pop("top_logprobs")
314
+ elif any(msg.lower() in str(e).lower() for msg in temperature_messages):
252
315
  generation_kwargs.pop("temperature")
316
+ elif isinstance(e, RateLimitError):
317
+ raise InvalidModel(
318
+ "You have encountered your rate limit for model "
319
+ f"{self.model_config.model_id!r}. The error message was: {e}"
320
+ )
253
321
  else:
254
322
  raise InvalidBenchmark(
255
323
  f"Failed to generate text. The error message was: {e}"
256
324
  )
325
+ except APIError as e:
326
+ raise InvalidBenchmark(
327
+ f"Failed to generate text. The error message was: {e}"
328
+ )
257
329
  except (
330
+ APIConnectionError,
258
331
  Timeout,
259
332
  ServiceUnavailableError,
260
- APIConnectionError,
261
333
  InternalServerError,
262
- ):
334
+ ) as e:
263
335
  logger.debug(
264
- "Service temporarily unavailable. Retrying in 5 seconds..."
336
+ f"Service temporarily unavailable. The error message was: {e}. "
337
+ f"Retrying in 5 seconds..."
265
338
  )
266
339
  sleep(5)
267
- except APIError as e:
268
- raise InvalidBenchmark(
269
- f"Failed to generate text. The error message was: {e}"
270
- )
271
340
  except AuthenticationError:
272
341
  raise NeedsAdditionalArgument(
273
342
  cli_argument="--api-key",
@@ -280,6 +349,15 @@ class LiteLLMModel(BenchmarkModule):
280
349
  )
281
350
 
282
351
  assert isinstance(model_response, ModelResponse)
352
+ if not model_response.choices:
353
+ # This happens for reasoning models, when they don't finish thinking and run
354
+ # out of tokens. Happens quite rarely, but we need to handle it.
355
+ logger.warning(
356
+ f"The model {self.model_config.model_id!r} did not end up generating "
357
+ "any text. This is likely because the model ran out of tokens while "
358
+ "reasoning. Returning an empty string."
359
+ )
360
+ return GenerativeModelOutput(sequences=[""])
283
361
  model_response_choices = model_response.choices[0]
284
362
  assert isinstance(model_response_choices, litellm.Choices)
285
363
  generation_output = model_response_choices.message["content"] or ""
@@ -314,7 +392,7 @@ class LiteLLMModel(BenchmarkModule):
314
392
  # If it is an Ollama model then we can get the number of parameters from the
315
393
  # Ollama Python SDK
316
394
  if self.is_ollama:
317
- ollama_model_id = self.model_config.model_id.split("/")[-1]
395
+ ollama_model_id = "/".join(self.model_config.model_id.split("/")[1:])
318
396
  model_info = ollama.show(ollama_model_id).modelinfo
319
397
  if model_info is not None:
320
398
  num_params = model_info.get("general.parameter_count")
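
The Ollama model-ID handling above (repeated further down in this file) now strips only the ollama/ or ollama_chat/ prefix, so namespaced model IDs keep their namespace. A tiny illustration with hypothetical IDs:

```python
def ollama_model_id(model_id: str) -> str:
    # Keep everything after the first "/", as in the updated lines above
    return "/".join(model_id.split("/")[1:])

# The old split("/")[-1] would return "llama3.1:8b" in both cases; the new
# version preserves namespaced Ollama IDs (examples are hypothetical).
print(ollama_model_id("ollama_chat/llama3.1:8b"))         # -> llama3.1:8b
print(ollama_model_id("ollama_chat/myuser/llama3.1:8b"))  # -> myuser/llama3.1:8b
```
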
@@ -334,7 +412,7 @@ class LiteLLMModel(BenchmarkModule):
334
412
  num_labels=self.dataset_config.num_labels,
335
413
  id2label=self.dataset_config.id2label,
336
414
  label2id=self.dataset_config.label2id,
337
- revision=self.model_config.revision,
415
+ revision="main",
338
416
  model_cache_dir=self.model_config.model_cache_dir,
339
417
  api_key=self.benchmark_config.api_key,
340
418
  trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -345,7 +423,7 @@ class LiteLLMModel(BenchmarkModule):
345
423
  try:
346
424
  repo_info = hf_api.model_info(
347
425
  repo_id=model_id,
348
- revision=self.model_config.revision,
426
+ revision="main",
349
427
  token=os.getenv("HUGGINGFACE_API_KEY")
350
428
  or self.benchmark_config.api_key
351
429
  or True,
@@ -398,7 +476,7 @@ class LiteLLMModel(BenchmarkModule):
398
476
  num_labels=self.dataset_config.num_labels,
399
477
  id2label=self.dataset_config.id2label,
400
478
  label2id=self.dataset_config.label2id,
401
- revision=self.model_config.revision,
479
+ revision="main",
402
480
  model_cache_dir=self.model_config.model_cache_dir,
403
481
  api_key=self.benchmark_config.api_key,
404
482
  trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -442,7 +520,7 @@ class LiteLLMModel(BenchmarkModule):
442
520
  # If it is an Ollama model then we can get the maximum length from the Ollama
443
521
  # Python SDK
444
522
  if self.is_ollama:
445
- ollama_model_id = self.model_config.model_id.split("/")[-1]
523
+ ollama_model_id = "/".join(self.model_config.model_id.split("/")[1:])
446
524
  model_info = ollama.show(ollama_model_id).modelinfo
447
525
  if model_info is not None:
448
526
  context_length_keys = [
@@ -478,7 +556,7 @@ class LiteLLMModel(BenchmarkModule):
478
556
  num_labels=self.dataset_config.num_labels,
479
557
  id2label=self.dataset_config.id2label,
480
558
  label2id=self.dataset_config.label2id,
481
- revision=self.model_config.revision,
559
+ revision="main",
482
560
  model_cache_dir=self.model_config.model_cache_dir,
483
561
  api_key=self.benchmark_config.api_key,
484
562
  trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -563,6 +641,7 @@ class LiteLLMModel(BenchmarkModule):
563
641
  return partial(
564
642
  sequence_classification.extract_labels_from_generation,
565
643
  dataset_config=self.dataset_config,
644
+ first_label_token_mapping=self.buffer["first_label_token_mapping"],
566
645
  )
567
646
  case TaskGroup.TEXT_TO_TEXT:
568
647
  return text_to_text.extract_labels_from_generation
@@ -605,12 +684,13 @@ class LiteLLMModel(BenchmarkModule):
605
684
  Whether the model exists, or an error describing why we cannot check
606
685
  whether the model exists.
607
686
  """
687
+ model_id, _ = model_id.split("@") if "@" in model_id else (model_id, "main")
608
688
  if model_id in litellm.model_list:
609
689
  return True
610
690
 
611
691
  # If it is an Ollama model then try to download it
612
692
  if model_id.startswith("ollama/") or model_id.startswith("ollama_chat/"):
613
- ollama_model_id = model_id.split("/")[-1]
693
+ ollama_model_id = "/".join(model_id.split("/")[1:])
614
694
  downloaded_ollama_models: list[str] = [
615
695
  model_obj.model
616
696
  for model_obj in ollama.list().models
@@ -657,12 +737,29 @@ class LiteLLMModel(BenchmarkModule):
657
737
  api_version=benchmark_config.api_version,
658
738
  )
659
739
  return True
740
+ except (
741
+ APIConnectionError,
742
+ Timeout,
743
+ ServiceUnavailableError,
744
+ InternalServerError,
745
+ ) as e:
746
+ logger.debug(
747
+ f"Service temporarily unavailable. The error message was: {e}. "
748
+ "Retrying in 10 seconds..."
749
+ )
750
+ sleep(5)
751
+ except RateLimitError:
752
+ logger.warning(
753
+ f"Rate limit exceeded for model {model_id!r}. Retrying in 10 "
754
+ "seconds..."
755
+ )
756
+ sleep(10)
660
757
  except APIError as e:
661
758
  if "'503 Service Unavailable" not in str(e):
662
759
  raise e
663
760
  logger.warning(
664
- f"Failed to check if model {model_id!r} exists. Retrying in "
665
- f"{num_attempts} seconds..."
761
+ f"Failed to check if model {model_id!r} exists. Retrying in 10 "
762
+ "seconds..."
666
763
  )
667
764
  sleep(10)
668
765
  except (BadRequestError, NotFoundError):
@@ -708,9 +805,10 @@ class LiteLLMModel(BenchmarkModule):
708
805
  Returns:
709
806
  The model configuration.
710
807
  """
808
+ model_id, revision = model_id.split("@") if "@" in model_id else (model_id, "")
711
809
  return ModelConfig(
712
810
  model_id=model_id,
713
- revision="main",
811
+ revision=revision,
714
812
  task="text-generation",
715
813
  languages=list(),
716
814
  merge=False,
@@ -1025,3 +1123,35 @@ class LiteLLMModel(BenchmarkModule):
1025
1123
 
1026
1124
  examples["messages"] = messages_list
1027
1125
  return examples
1126
+
1127
+
1128
+ def raise_if_wrong_params(
1129
+ model_config: ModelConfig, allowed_params: dict[str, list[str]]
1130
+ ) -> None:
1131
+ """Raise an error if the model configuration has invalid parameters.
1132
+
1133
+ Args:
1134
+ model_config:
1135
+ The model configuration.
1136
+ allowed_params:
1137
+ The allowed parameters for the model.
1138
+
1139
+ Raises:
1140
+ InvalidModel:
1141
+ If the model configuration has invalid parameters.
1142
+ """
1143
+ param = model_config.revision
1144
+ if param == "":
1145
+ return
1146
+ for model_regex, allowed_params_list in allowed_params.items():
1147
+ if re.fullmatch(pattern=model_regex, string=model_config.model_id):
1148
+ if param not in allowed_params_list:
1149
+ msg = (
1150
+ f"Invalid parameter {param!r} for model {model_config.model_id!r}."
1151
+ )
1152
+ if allowed_params_list:
1153
+ msg += f" Allowed parameters are: {', '.join(allowed_params_list)}."
1154
+ else:
1155
+ msg += " No parameters are allowed."
1156
+ raise InvalidModel(msg)
1157
+ return
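
Taken together with the get_model_config hunk above, raise_if_wrong_params lets a benchmark parameter be appended to an API model ID with "@" (stored in ModelConfig.revision) and validated against ALLOWED_PARAMS. A rough usage sketch with simplified stand-ins for ModelConfig and InvalidModel (the real classes live in euroeval.data_models and euroeval.exceptions):

```python
import re
from dataclasses import dataclass

@dataclass
class ModelConfig:  # simplified stand-in for euroeval.data_models.ModelConfig
    model_id: str
    revision: str

class InvalidModel(Exception):  # stand-in for euroeval.exceptions.InvalidModel
    pass

ALLOWED_PARAMS = {  # trimmed copy of the mapping added in this release
    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "high"],
    r"(anthropic/)?claude-3.7-sonnet.*": ["thinking"],
}

def get_model_config(model_id: str) -> ModelConfig:
    # "model@param" -> ("model", "param"); no "@" -> empty revision
    model_id, revision = model_id.split("@") if "@" in model_id else (model_id, "")
    return ModelConfig(model_id=model_id, revision=revision)

def raise_if_wrong_params(model_config: ModelConfig, allowed: dict[str, list[str]]) -> None:
    param = model_config.revision
    if param == "":
        return
    for model_regex, allowed_params_list in allowed.items():
        if re.fullmatch(model_regex, model_config.model_id) and param not in allowed_params_list:
            raise InvalidModel(f"Invalid parameter {param!r} for {model_config.model_id!r}")

raise_if_wrong_params(get_model_config("o1-mini@high"), ALLOWED_PARAMS)      # OK
raise_if_wrong_params(get_model_config("o1-mini@thinking"), ALLOWED_PARAMS)  # raises InvalidModel
```
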
euroeval/benchmark_modules/vllm.py CHANGED
@@ -25,10 +25,10 @@ from urllib3.exceptions import RequestError
25
25
 
26
26
  from ..constants import (
27
27
  GENERATIVE_PIPELINE_TAGS,
28
+ MAX_CONTEXT_LENGTH,
28
29
  MAX_LOGPROBS,
29
30
  MERGE_TAGS,
30
31
  REASONING_MAX_TOKENS,
31
- TASK_GROUPS_USING_LOGPROBS,
32
32
  TASKS_USING_JSON,
33
33
  VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
34
34
  )
@@ -66,6 +66,7 @@ from ..utils import (
66
66
  get_bos_token,
67
67
  get_end_of_chat_token_ids,
68
68
  get_eos_token,
69
+ get_first_label_token_mapping,
69
70
  get_min_cuda_compute_capability,
70
71
  log_once,
71
72
  should_prompts_be_stripped,
@@ -122,11 +123,8 @@ class VLLMModel(HuggingFaceEncoderModel):
122
123
  ):
123
124
  raise NeedsExtraInstalled(extra="generative")
124
125
 
125
- output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
126
126
  model, tokenizer = load_model_and_tokenizer(
127
- model_config=model_config,
128
- benchmark_config=benchmark_config,
129
- output_scores=output_scores,
127
+ model_config=model_config, benchmark_config=benchmark_config
130
128
  )
131
129
  self._model: LLM = model
132
130
  self._tokenizer: PreTrainedTokenizer = tokenizer
@@ -142,8 +140,12 @@ class VLLMModel(HuggingFaceEncoderModel):
142
140
  benchmark_config=benchmark_config,
143
141
  )
144
142
 
145
- self.buffer["output_scores"] = output_scores
146
- self.buffer["instruction_model"] = self._tokenizer.chat_template is not None
143
+ self.buffer |= dict(
144
+ instruction_model=self._tokenizer.chat_template is not None,
145
+ first_label_token_mapping=get_first_label_token_mapping(
146
+ dataset_config=self.dataset_config, tokenizer=self._tokenizer
147
+ ),
148
+ )
147
149
  if self.model_config.adapter_base_model_id is not None:
148
150
  adapter_path = snapshot_download(
149
151
  repo_id=self.model_config.model_id,
@@ -185,6 +187,7 @@ class VLLMModel(HuggingFaceEncoderModel):
185
187
  return partial(
186
188
  sequence_classification.extract_labels_from_generation,
187
189
  dataset_config=self.dataset_config,
190
+ first_label_token_mapping=self.buffer["first_label_token_mapping"],
188
191
  )
189
192
  case TaskGroup.TEXT_TO_TEXT:
190
193
  return text_to_text.extract_labels_from_generation
@@ -338,6 +341,12 @@ class VLLMModel(HuggingFaceEncoderModel):
338
341
  else:
339
342
  logits_processor = None
340
343
 
344
+ # Get the mapping from labels to the first token in the label. We call this each
345
+ # time we generate a new dataset since the dataset config can change
346
+ self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
347
+ dataset_config=self.dataset_config, tokenizer=self._tokenizer
348
+ )
349
+
341
350
  # Define the parameters used for vLLM generation
342
351
  max_tokens: int = (
343
352
  REASONING_MAX_TOKENS
@@ -346,7 +355,7 @@ class VLLMModel(HuggingFaceEncoderModel):
346
355
  )
347
356
  sampling_params = SamplingParams(
348
357
  max_tokens=max_tokens,
349
- logprobs=MAX_LOGPROBS if self.buffer["output_scores"] else None,
358
+ logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
350
359
  temperature=0.0,
351
360
  stop=[stop_token for stop_token in stop_tokens if stop_token],
352
361
  logits_processors=[logits_processor] if logits_processor else None,
@@ -416,7 +425,7 @@ class VLLMModel(HuggingFaceEncoderModel):
416
425
  completions = [completion.strip() for completion in completions]
417
426
 
418
427
  # Add logprobs scores to the output
419
- if self.buffer["output_scores"]:
428
+ if self.buffer["first_label_token_mapping"]:
420
429
  scores: list[list[list[tuple[str, float]]]] = [
421
430
  [
422
431
  [
@@ -846,7 +855,7 @@ class VLLMModel(HuggingFaceEncoderModel):
846
855
 
847
856
 
848
857
  def load_model_and_tokenizer(
849
- model_config: ModelConfig, benchmark_config: BenchmarkConfig, output_scores: bool
858
+ model_config: ModelConfig, benchmark_config: BenchmarkConfig
850
859
  ) -> "tuple[LLM, PreTrainedTokenizer]":
851
860
  """Load the model and tokenizer.
852
861
 
@@ -855,11 +864,9 @@ def load_model_and_tokenizer(
855
864
  The model configuration.
856
865
  benchmark_config:
857
866
  The benchmark configuration.
858
- output_scores:
859
- Whether to output scores.
860
867
 
861
868
  Returns:
862
- The loaded model and tokenizer.
869
+ A pair (model, tokenizer), with the loaded model and tokenizer
863
870
  """
864
871
  # Prefer base model ID if the model is an adapter - the adapter will be added on
865
872
  # during inference in this case
@@ -893,7 +900,27 @@ def load_model_and_tokenizer(
893
900
  if quantization == "awq" and importlib.util.find_spec("awq") is None:
894
901
  raise NeedsExtraInstalled(extra="quantization")
895
902
 
903
+ # Start with dtype being the "auto" vLLM dtype
896
904
  dtype: str | torch.dtype = "auto"
905
+
906
+ # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
907
+ if hf_model_config.torch_dtype == torch.float32:
908
+ if torch.cuda.is_bf16_supported():
909
+ logger.info(
910
+ "You are loading a model with dtype FP32, which we will convert to "
911
+ "BF16 as FP32 is not supported by vLLM and BF16 is supported by your "
912
+ "GPU."
913
+ )
914
+ dtype = torch.bfloat16
915
+ else:
916
+ logger.info(
917
+ "You are loading a model with dtype FP32, which we will convert to "
918
+ "FP16 as FP32 is not supported by vLLM and BF16 is not supported by "
919
+ "your GPU."
920
+ )
921
+ dtype = torch.float16
922
+
923
+ # If the model is a quantized model, we need to set the dtype to float16
897
924
  if quantization is not None and hf_model_config.torch_dtype != torch.float16:
898
925
  logger.info(
899
926
  "You are loading a quantized model with dtype "
@@ -902,6 +929,7 @@ def load_model_and_tokenizer(
902
929
  )
903
930
  dtype = torch.float16
904
931
 
932
+ # If the model is a bf16 model, we need to check the CUDA compute capability
905
933
  if hf_model_config.torch_dtype == torch.bfloat16:
906
934
  min_cuda_compute_capability = get_min_cuda_compute_capability()
907
935
  required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
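
The dtype handling above prefers BF16 over FP16 when an FP32 checkpoint has to be down-cast for vLLM, and still forces FP16 for quantized models. Condensed into a small helper for illustration only (the real code also logs its choice and, for BF16 models, checks the CUDA compute capability):

```python
import torch

def pick_vllm_dtype(config_dtype: torch.dtype, quantized: bool, bf16_supported: bool):
    """Roughly mirror the dtype choice made in the hunks above."""
    dtype: str | torch.dtype = "auto"
    if config_dtype == torch.float32:
        # vLLM does not run FP32 checkpoints, so down-cast: BF16 if the GPU
        # supports it, otherwise FP16
        dtype = torch.bfloat16 if bf16_supported else torch.float16
    if quantized and config_dtype != torch.float16:
        # Quantized models are always loaded in FP16
        dtype = torch.float16
    return dtype

print(pick_vllm_dtype(torch.float32, quantized=False, bf16_supported=True))  # torch.bfloat16
print(pick_vllm_dtype(torch.bfloat16, quantized=True, bf16_supported=True))  # torch.float16
```
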
@@ -940,7 +968,17 @@ def load_model_and_tokenizer(
940
968
  if len(true_max_model_len_candidates) > 0:
941
969
  true_max_model_len = min(true_max_model_len_candidates)
942
970
  else:
943
- true_max_model_len = 5_000
971
+ true_max_model_len = MAX_CONTEXT_LENGTH
972
+
973
+ tokenizer = load_tokenizer(
974
+ model_id=model_config.model_id,
975
+ revision=model_config.revision,
976
+ adapter_base_model_id=model_config.adapter_base_model_id,
977
+ trust_remote_code=benchmark_config.trust_remote_code,
978
+ model_max_length=true_max_model_len,
979
+ model_cache_dir=model_config.model_cache_dir,
980
+ token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
981
+ )
944
982
 
945
983
  clear_vllm()
946
984
 
@@ -951,7 +989,7 @@ def load_model_and_tokenizer(
951
989
  model=model_id,
952
990
  tokenizer=model_id,
953
991
  gpu_memory_utilization=0.95,
954
- max_model_len=min(true_max_model_len, 5_000),
992
+ max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
955
993
  download_dir=download_dir,
956
994
  trust_remote_code=benchmark_config.trust_remote_code,
957
995
  revision=revision,
@@ -962,7 +1000,6 @@ def load_model_and_tokenizer(
962
1000
  quantization=quantization,
963
1001
  dtype=dtype,
964
1002
  enforce_eager=True,
965
- max_logprobs=MAX_LOGPROBS if output_scores else None,
966
1003
  # TEMP: Prefix caching isn't supported with sliding window in vLLM yet,
967
1004
  # so we disable it for now
968
1005
  enable_prefix_caching=False,
@@ -988,16 +1025,6 @@ def load_model_and_tokenizer(
988
1025
  model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
989
1026
  model.config = hf_model_config
990
1027
 
991
- tokenizer = load_tokenizer(
992
- model_id=model_config.model_id,
993
- revision=model_config.revision,
994
- adapter_base_model_id=model_config.adapter_base_model_id,
995
- trust_remote_code=benchmark_config.trust_remote_code,
996
- model_max_length=true_max_model_len,
997
- model_cache_dir=model_config.model_cache_dir,
998
- token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
999
- )
1000
-
1001
1028
  return model, tokenizer
1002
1029
 
1003
1030
 
@@ -1157,15 +1184,13 @@ def get_end_of_reasoning_token_id(
1157
1184
 
1158
1185
  # Generate a completion and remove the BOS token from it, to not confuse it with the
1159
1186
  # potential reasoning token
1160
- completion = (
1161
- model.generate(
1162
- prompts=[prompt],
1163
- sampling_params=SamplingParams(max_tokens=3, temperature=0.0),
1164
- use_tqdm=False,
1165
- )[0]
1166
- .outputs[0]
1167
- .text
1187
+ model_output = model.generate(
1188
+ prompts=[prompt],
1189
+ sampling_params=SamplingParams(max_tokens=3, temperature=0.0),
1190
+ use_tqdm=False,
1168
1191
  )
1192
+ completion = model_output[0].outputs[0].text
1193
+
1169
1194
  if tokenizer.bos_token is not None:
1170
1195
  if isinstance(tokenizer.bos_token, str):
1171
1196
  prompt = prompt.replace(tokenizer.bos_token, "").strip()
euroeval/benchmarker.py CHANGED
@@ -366,14 +366,18 @@ class Benchmarker:
366
366
  dataset_names=benchmark_config.datasets
367
367
  )
368
368
 
369
+ total_benchmarks = len(model_ids) * len(dataset_configs)
370
+ num_finished_benchmarks = 0
371
+
369
372
  current_benchmark_results: list[BenchmarkResult] = list()
370
- for m_id in model_ids:
373
+ for model_id in model_ids:
371
374
  try:
372
375
  model_config = get_model_config(
373
- model_id=m_id, benchmark_config=benchmark_config
376
+ model_id=model_id, benchmark_config=benchmark_config
374
377
  )
375
378
  except InvalidModel as e:
376
379
  logger.info(e.message)
380
+ num_finished_benchmarks += len(dataset_configs)
377
381
  continue
378
382
 
379
383
  loaded_model: BenchmarkModule | None = None
@@ -381,16 +385,18 @@ class Benchmarker:
381
385
  # Skip if we have already benchmarked this model on this dataset and
382
386
  # we are not forcing the benchmark
383
387
  if not benchmark_config.force and model_has_been_benchmarked(
384
- model_id=m_id,
388
+ model_id=model_id,
385
389
  dataset=dataset_config.name,
386
390
  few_shot=benchmark_config.few_shot,
387
391
  validation_split=not benchmark_config.evaluate_test_split,
388
392
  benchmark_results=self.benchmark_results,
389
393
  ):
390
394
  logger.debug(
391
- f"Skipping benchmarking {m_id} on {dataset_config.pretty_name},"
392
- " as it has already been benchmarked."
395
+ f"Skipping benchmarking {model_id} on "
396
+ f"{dataset_config.pretty_name}, as it "
397
+ "has already been benchmarked."
393
398
  )
399
+ num_finished_benchmarks += 1
394
400
  continue
395
401
 
396
402
  # We do not re-initialise generative models as their architecture is not
@@ -413,6 +419,15 @@ class Benchmarker:
413
419
  if benchmark_config.raise_errors:
414
420
  raise e
415
421
  logger.info(e.message)
422
+
423
+ # Add the remaining number of benchmarks for the model to
424
+ # our benchmark counter, since we're skipping the
425
+ # rest of them
426
+ num_finished_benchmarks += (
427
+ len(dataset_configs)
428
+ - dataset_configs.index(dataset_config)
429
+ - 1
430
+ )
416
431
  break
417
432
  else:
418
433
  loaded_model.dataset_config = dataset_config
@@ -435,16 +450,24 @@ class Benchmarker:
435
450
  if benchmark_config.raise_errors:
436
451
  raise benchmark_output_or_err
437
452
  logger.info(
438
- f"{m_id} could not be benchmarked on "
453
+ f"{model_id} could not be benchmarked on "
439
454
  f"{dataset_config.pretty_name}. Skipping. The error message "
440
455
  f"raised was {benchmark_output_or_err.message!r}."
441
456
  )
457
+ num_finished_benchmarks += 1
442
458
  continue
443
459
 
444
460
  elif isinstance(benchmark_output_or_err, InvalidModel):
445
461
  if benchmark_config.raise_errors:
446
462
  raise benchmark_output_or_err
447
463
  logger.info(benchmark_output_or_err.message)
464
+
465
+ # Add the remaining number of benchmarks for the model to
466
+ # our benchmark counter, since we're skipping the
467
+ # rest of them
468
+ num_finished_benchmarks += (
469
+ len(dataset_configs) - dataset_configs.index(dataset_config) - 1
470
+ )
448
471
  break
449
472
 
450
473
  else:
@@ -453,6 +476,12 @@ class Benchmarker:
453
476
  if benchmark_config.save_results:
454
477
  record.append_to_results(results_path=self.results_path)
455
478
 
479
+ num_finished_benchmarks += 1
480
+ logger.info(
481
+ f"Finished {num_finished_benchmarks} out of "
482
+ f"{total_benchmarks} benchmarks."
483
+ )
484
+
456
485
  if benchmark_config.clear_model_cache:
457
486
  clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
458
487
 
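
The new progress reporting treats the run as a flat model × dataset grid and counts skipped or failed combinations as finished, so the "Finished X out of Y benchmarks" log converges on the total. A toy illustration with made-up model names and illustrative dataset names:

```python
model_ids = ["model-a", "model-b", "model-c"]  # hypothetical
dataset_configs = ["angry-tweets", "scala-da", "scandiqa-da", "nordjylland-news"]

total_benchmarks = len(model_ids) * len(dataset_configs)  # 3 * 4 = 12
num_finished_benchmarks = 0

# model-a cannot even be configured (InvalidModel): all of its dataset slots
# are counted as finished in one go, as in the diff above
num_finished_benchmarks += len(dataset_configs)  # -> 4

# model-b and model-c run (or are skipped) dataset by dataset
for _model in ("model-b", "model-c"):
    for _dataset in dataset_configs:
        num_finished_benchmarks += 1
        print(f"Finished {num_finished_benchmarks} out of {total_benchmarks} benchmarks.")
```
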
euroeval/constants.py CHANGED
@@ -7,6 +7,13 @@ from .tasks import NER
7
7
  DUMMY_FILL_VALUE = 100
8
8
 
9
9
 
10
+ # This is the maximum allowed context length for models for the purpose of this
11
+ # benchmark. We will still report the models' true maximum context length in the
12
+ # metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
13
+ # all tokens in the context.
14
+ MAX_CONTEXT_LENGTH = 5_000
15
+
16
+
10
17
  # We need to raise the amount of tokens generated for reasoning models, to give them
11
18
  # time to think
12
19
  REASONING_MAX_TOKENS = 8_192
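
MAX_CONTEXT_LENGTH centralises the hard-coded 5,000-token cap that was previously inlined in hf.py and vllm.py: evaluation always clamps to it because vLLM allocates memory for every token in the context, while the reported metadata now avoids mistaking the cap for a model's true limit. A small sketch of the clamping (values are illustrative):

```python
MAX_CONTEXT_LENGTH = 5_000  # as added above

def evaluation_context_length(model_max_length: int) -> int:
    # vLLM reserves KV-cache memory for every token in the context window, so
    # the benchmark never evaluates with more than MAX_CONTEXT_LENGTH tokens
    return min(model_max_length, MAX_CONTEXT_LENGTH)

print(evaluation_context_length(131_072))  # -> 5000: long-context models are capped
print(evaluation_context_length(2_048))    # -> 2048: short contexts are unaffected
```
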
@@ -47,7 +54,7 @@ TASK_GROUPS_USING_LOGPROBS = [
47
54
  MAX_LOGPROBS = 10
48
55
 
49
56
 
50
- # We make sure to remove these metric attributed after each iteration, to avoid memory
57
+ # We make sure to remove these metric attributes after each iteration, to avoid memory
51
58
  # leaks
52
59
  METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
53
60
 
euroeval/data_models.py CHANGED
@@ -10,10 +10,9 @@ from dataclasses import dataclass, field
10
10
  import pydantic
11
11
  import torch
12
12
 
13
- from euroeval.utils import get_package_version
14
-
15
13
  from .enums import Device, InferenceBackend, ModelType, TaskGroup
16
14
  from .types import ScoreDict
15
+ from .utils import get_package_version
17
16
 
18
17
 
19
18
  @dataclass
@@ -1643,7 +1643,7 @@ ORANGE_SUM_CONFIG = DatasetConfig(
1643
1643
 
1644
1644
  ILPOST_SUM_CONFIG = DatasetConfig(
1645
1645
  name="ilpost-sum",
1646
- pretty_name="the truncated version of the Italian summarisation dataset IlPost",
1646
+ pretty_name="the truncated version of the Italian summarisation dataset IlPost-Sum",
1647
1647
  huggingface_id="EuroEval/ilpost-sum",
1648
1648
  task=SUMM,
1649
1649
  languages=[IT],
euroeval/task_utils/sequence_classification.py CHANGED
@@ -10,6 +10,7 @@ import numpy as np
10
10
  from evaluate import EvaluationModule
11
11
 
12
12
  from ..data_models import BenchmarkConfig, GenerativeModelOutput
13
+ from ..exceptions import InvalidBenchmark
13
14
  from ..utils import log_once, raise_if_model_output_contains_nan_values
14
15
 
15
16
  if t.TYPE_CHECKING:
@@ -110,6 +111,7 @@ def extract_labels_from_generation(
110
111
  input_batch: dict[str, list],
111
112
  model_output: GenerativeModelOutput,
112
113
  dataset_config: "DatasetConfig",
114
+ first_label_token_mapping: dict[str, str] | bool,
113
115
  ) -> list[str]:
114
116
  """Extract the predicted labels from the generated output.
115
117
 
@@ -121,13 +123,19 @@ def extract_labels_from_generation(
121
123
  The raw generated output of the model.
122
124
  dataset_config:
123
125
  The configuration of the dataset.
126
+ first_label_token_mapping:
127
+ A mapping from labels to the first token in each label, or alternatively a
128
+ Boolean value indicating whether the model should output scores (if the
129
+ mapping is outputted then the model will always output scores).
124
130
 
125
131
  Returns:
126
132
  The predicted labels.
127
133
  """
128
134
  if model_output.scores is not None:
129
135
  return get_closest_logprobs_labels(
130
- generation_logprobs=model_output.scores, dataset_config=dataset_config
136
+ generation_logprobs=model_output.scores,
137
+ dataset_config=dataset_config,
138
+ first_label_token_mapping=first_label_token_mapping,
131
139
  )
132
140
  else:
133
141
  return get_closest_word_edit_labels(
@@ -138,6 +146,7 @@ def extract_labels_from_generation(
138
146
  def get_closest_logprobs_labels(
139
147
  generation_logprobs: list[list[list[tuple[str, float]]]],
140
148
  dataset_config: "DatasetConfig",
149
+ first_label_token_mapping: dict[str, str] | bool,
141
150
  ) -> list[str]:
142
151
  """Get the labels with the highest predicted logprob value.
143
152
 
@@ -152,6 +161,10 @@ def get_closest_logprobs_labels(
152
161
  (batch_size, num_tokens, num_logprobs).
153
162
  dataset_config:
154
163
  The configuration of the dataset.
164
+ first_label_token_mapping:
165
+ A mapping from labels to the first token in each label, or alternatively a
166
+ Boolean value indicating whether the model should output scores (if the
167
+ mapping is outputted then the model will always output scores).
155
168
 
156
169
  Returns:
157
170
  The predicted labels.
@@ -185,11 +198,29 @@ def get_closest_logprobs_labels(
185
198
  generated_label = "".join(previously_generated_labels) + generated_label
186
199
 
187
200
  # Get the candidate labels that starts with the generated label
188
- candidate_output_labels = {
189
- candidate_label
190
- for candidate_label in candidate_labels
191
- if candidate_label.startswith(generated_label)
192
- }
201
+ if isinstance(first_label_token_mapping, dict):
202
+ if any(
203
+ candidate_label not in first_label_token_mapping
204
+ for candidate_label in candidate_labels
205
+ ):
206
+ raise InvalidBenchmark(
207
+ "There is a label not present in the first label token "
208
+ "mapping - this should never happen! Please report this "
209
+ "issue to the EuroEval team at "
210
+ "github.com/EuroEval/EuroEval/issues."
211
+ )
212
+
213
+ candidate_output_labels = {
214
+ candidate_label
215
+ for candidate_label in candidate_labels
216
+ if generated_label == first_label_token_mapping[candidate_label]
217
+ }
218
+ else:
219
+ candidate_output_labels = {
220
+ candidate_label
221
+ for candidate_label in candidate_labels
222
+ if candidate_label.startswith(generated_label)
223
+ }
193
224
 
194
225
  # If we can uniquely determine the output label, we break the loop. If
195
226
  # there are multiple possible labels then we store the current one, and
@@ -206,7 +237,7 @@ def get_closest_logprobs_labels(
206
237
  else:
207
238
  output_label = candidate_output_labels.pop()
208
239
  candidate_output_labels.add(output_label)
209
- log_once(
240
+ raise InvalidBenchmark(
210
241
  "Multiple candidate labels found for the generated label "
211
242
  f"{generated_label!r}: {candidate_output_labels}. Since "
212
243
  "this is not the first generated label, we cannot "
@@ -214,9 +245,13 @@ def get_closest_logprobs_labels(
214
245
  f"forced to use the arbitrary {output_label!r} as the "
215
246
  "output label, potentially resulting in worse performance. "
216
247
  "Please report this issue to the EuroEval team at "
217
- "github.com/EuroEval/EuroEval/issues.",
218
- level=logging.WARNING,
248
+ "github.com/EuroEval/EuroEval/issues."
219
249
  )
250
+ elif len(candidate_output_labels) == 0:
251
+ logger.debug(
252
+ f"No candidate label found for the generated label "
253
+ f"{generated_label!r}. The generated label is thus ignored."
254
+ )
220
255
 
221
256
  if output_label is not None:
222
257
  output_labels.append(output_label)
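
With first_label_token_mapping available, logprob-based label extraction compares the generated first token against a precomputed token-per-label table instead of a prefix match, and ambiguous matches now raise instead of being silently guessed. A stripped-down sketch of the matching step (the task and mapping are hypothetical; the real code also accumulates multi-token labels):

```python
# Hypothetical mapping for a sentiment task: label -> its (cleaned) first token
first_label_token_mapping = {"positiv": "positiv", "negativ": "negativ", "neutral": "neutral"}

def candidates_for(generated_token: str, mapping: dict[str, str]) -> set[str]:
    """Labels whose first token exactly matches the generated token."""
    generated = generated_token.strip().lower()
    return {label for label, first_token in mapping.items() if generated == first_token}

print(candidates_for(" positiv", first_label_token_mapping))  # {'positiv'}
print(candidates_for("pos", first_label_token_mapping))       # set() -> token is ignored
```
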
euroeval/utils.py CHANGED
@@ -7,12 +7,12 @@ import importlib.util
7
7
  import logging
8
8
  import os
9
9
  import random
10
+ import re
10
11
  import sys
11
12
  import typing as t
12
13
  import warnings
13
14
  from functools import cache
14
15
  from pathlib import Path
15
- from types import TracebackType
16
16
 
17
17
  import litellm
18
18
  import numpy as np
@@ -20,7 +20,6 @@ import requests
20
20
  import torch
21
21
  from datasets.utils import disable_progress_bar
22
22
  from requests.exceptions import RequestException
23
- from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
24
23
  from transformers import logging as tf_logging
25
24
 
26
25
  from .exceptions import InvalidModel, NaNValueInModelOutput
@@ -29,6 +28,11 @@ if importlib.util.find_spec("ray") is not None:
29
28
  import ray
30
29
 
31
30
  if t.TYPE_CHECKING:
31
+ from types import TracebackType
32
+
33
+ from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
34
+
35
+ from .data_models import DatasetConfig
32
36
  from .types import Predictions
33
37
 
34
38
 
@@ -285,7 +289,7 @@ class HiddenPrints:
285
289
  self,
286
290
  exc_type: t.Type[BaseException],
287
291
  exc_val: BaseException,
288
- exc_tb: TracebackType,
292
+ exc_tb: "TracebackType",
289
293
  ) -> None:
290
294
  """Exit the context manager."""
291
295
  sys.stdout.close()
@@ -355,7 +359,6 @@ def should_prompts_be_stripped(
355
359
  return strip_prompts
356
360
 
357
361
 
358
- # TODO: This is currently not used - maybe remove.
359
362
  def should_prefix_space_be_added_to_labels(
360
363
  labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
361
364
  ) -> bool:
@@ -576,3 +579,96 @@ def get_package_version(package_name: str) -> str | None:
576
579
  return importlib.metadata.version(package_name)
577
580
  except importlib.metadata.PackageNotFoundError:
578
581
  return None
582
+
583
+
584
+ def get_first_label_token_mapping(
585
+ dataset_config: "DatasetConfig", tokenizer: "PreTrainedTokenizer | None"
586
+ ) -> dict[str, str] | bool:
587
+ """Check if the model should output scores.
588
+
589
+ Args:
590
+ dataset_config:
591
+ The dataset configuration.
592
+ tokenizer:
593
+ The tokenizer, or None if not available.
594
+
595
+ Returns:
596
+ A mapping from labels to the first token in each label, or alternatively a
597
+ Boolean value indicating whether the model should output scores (if the mapping
598
+ is outputted then the model will always output scores).
599
+ """
600
+ # Importing here to avoid circular imports
601
+ from .constants import TASK_GROUPS_USING_LOGPROBS
602
+
603
+ # If we do not have any tokenizer, then we cannot check if the model should output
604
+ # scores and we just assume it should if the dataset supports it
605
+ output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
606
+ if tokenizer is None:
607
+ if output_scores:
608
+ log_once(
609
+ "The model will output scores, since the dataset supports it and no "
610
+ "tokenizer is available.",
611
+ level=logging.DEBUG,
612
+ )
613
+ else:
614
+ log_once(
615
+ "The model will not output scores, since the dataset does not support "
616
+ "it and no tokenizer is available.",
617
+ level=logging.DEBUG,
618
+ )
619
+ return output_scores
620
+
621
+ # If there are labels associated with the dataset, and that the first token of each
622
+ # label is distinct, then we can safely use the logprobs
623
+ if output_scores and dataset_config.labels:
624
+ local_labels = [
625
+ dataset_config.prompt_label_mapping[label].strip()
626
+ for label in dataset_config.labels
627
+ ]
628
+
629
+ # Get the first token of each label, where we add a prefix space if needed
630
+ add_prefix_space = (
631
+ should_prefix_space_be_added_to_labels(
632
+ labels_to_be_generated=local_labels, tokenizer=tokenizer
633
+ )
634
+ and tokenizer.chat_template is None
635
+ )
636
+ first_tokens = [
637
+ tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)[0]
638
+ for label in local_labels
639
+ ]
640
+ first_tokens = [
641
+ re.sub(
642
+ pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", repl="", string=token.lower()
643
+ )
644
+ for token in first_tokens
645
+ ]
646
+
647
+ # Build a mapping from labels to the first token in each label if the first
648
+ # tokens are distinct
649
+ if len(first_tokens) == len(set(first_tokens)):
650
+ log_once(
651
+ "The model will output scores, since the first tokens of the labels "
652
+ "are distinct.",
653
+ level=logging.DEBUG,
654
+ )
655
+ return {
656
+ label: first_token
657
+ for label, first_token in zip(local_labels, first_tokens)
658
+ }
659
+ else:
660
+ log_once(
661
+ "The model will not output scores, since the first tokens of the "
662
+ "labels are not distinct. The first tokens for the labels "
663
+ f"{local_labels} are {first_tokens}"
664
+ )
665
+ return False
666
+
667
+ # Otherwise, we assume that the model should not output scores, to avoid potential
668
+ # evaluation errors. This will force the label extraction to rely on word edit
669
+ # distance instead of logprobs.
670
+ log_once(
671
+ "The model will not output scores, since the dataset does not have labels.",
672
+ level=logging.DEBUG,
673
+ )
674
+ return False
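
get_first_label_token_mapping decides between logprob-based and edit-distance-based label extraction: it returns a label-to-first-token dict only when every label starts with a distinct token under the model's tokenizer, True when no tokenizer is available but the task group supports logprobs, and False otherwise. A rough sketch of the distinctness check, using a whitespace "tokenizer" stand-in instead of a real Hugging Face tokenizer:

```python
import re

def first_token(label: str) -> str:
    # Stand-in for tokenizer.tokenize(...)[0] plus the lower-casing/cleanup regex
    token = label.split()[0] if label.split() else label
    return re.sub(r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", "", token.lower())

def sketch_first_label_token_mapping(labels: list[str]) -> dict[str, str] | bool:
    first_tokens = [first_token(label) for label in labels]
    if len(first_tokens) == len(set(first_tokens)):
        return dict(zip(labels, first_tokens))  # distinct -> safe to use logprobs
    return False                                 # clash -> fall back to edit distance

print(sketch_first_label_token_mapping(["positiv", "negativ", "neutral"]))
# {'positiv': 'positiv', 'negativ': 'negativ', 'neutral': 'neutral'}
print(sketch_first_label_token_mapping(["ja", "ja, delvist"]))  # False (both start with 'ja')
```
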
euroeval-15.5.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: EuroEval
3
- Version: 15.4.2
3
+ Version: 15.5.0
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -37,7 +37,7 @@ Requires-Dist: demjson3>=3.0.6
37
37
  Requires-Dist: evaluate>=0.4.1
38
38
  Requires-Dist: huggingface-hub>=0.24.0
39
39
  Requires-Dist: levenshtein>=0.24.0
40
- Requires-Dist: litellm>=1.61.13
40
+ Requires-Dist: litellm>=1.63.0
41
41
  Requires-Dist: more-itertools>=10.5.0
42
42
  Requires-Dist: numpy<2.0.0,>=1.23.0
43
43
  Requires-Dist: ollama>=0.4.7
@@ -62,12 +62,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
62
62
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
63
63
  Requires-Dist: gradio>=4.26.0; extra == 'all'
64
64
  Requires-Dist: outlines>=0.1.11; extra == 'all'
65
- Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'all'
65
+ Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'all'
66
66
  Provides-Extra: generative
67
67
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
68
68
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
69
69
  Requires-Dist: outlines>=0.1.11; extra == 'generative'
70
- Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'generative'
70
+ Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
71
71
  Provides-Extra: human-evaluation
72
72
  Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
73
73
  Provides-Extra: test
@@ -218,6 +218,7 @@ Replace <name-of-script> with the specific script you wish to execute, e.g.,
218
218
  $ uv run src/scripts/create_allocine.py
219
219
  ```
220
220
 
221
+
221
222
  ## Special Thanks :pray:
222
223
  - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
223
224
  models on the leaderboards.
euroeval-15.5.0.dist-info/RECORD CHANGED
@@ -1,12 +1,12 @@
1
- euroeval/__init__.py,sha256=l3V3ybiCj0I193jvn8wS9VK4UEc9ajiOq4SojChH6Xs,2615
1
+ euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
2
2
  euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
3
- euroeval/benchmarker.py,sha256=PIdqLPleLN3nml5Zb1g_dQaLzqxQhmgC8VuvD5yloV4,46524
3
+ euroeval/benchmarker.py,sha256=8Qt1NL7k5n-AfFrhR6139wmmsVS7CgRa-QjminH0d_c,47849
4
4
  euroeval/callbacks.py,sha256=bThUUxOgkMuESUQ5rrFRoSumKV8vNw53CslIZTpkt54,2438
5
5
  euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
6
- euroeval/constants.py,sha256=zL8dm7SEFpIgC2vaPhqzdKydVSWW-ZyMHenWPnNxWqQ,1681
6
+ euroeval/constants.py,sha256=CJavEDvKLSKAC4uyz44sFrY1W1AnjUsxkXF63SoMjw4,1985
7
7
  euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
8
- euroeval/data_models.py,sha256=b4rOMdhoxkIPcnTQdwqq5iWaF6uia1OzAgdiOBvoGVM,14779
9
- euroeval/dataset_configs.py,sha256=C5Gnp95cBeCmmuRA8Rznt0c4gMOn8Pilk_kDCleDMjg,90640
8
+ euroeval/data_models.py,sha256=QssdR_msDTmsp9yKe0cVba0iCpgBTFTOaOUn44o1cl8,14770
9
+ euroeval/dataset_configs.py,sha256=6WiRW-VAAMIL6-1J6Nb6pCm6mf4I-oQ087zB0es3HHs,90644
10
10
  euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
11
11
  euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
12
12
  euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
@@ -20,21 +20,21 @@ euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
20
20
  euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
21
21
  euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
22
22
  euroeval/types.py,sha256=5DIhaVyzH8RO9jdJfibX9pwbZviQwU35dMsfszD2Whs,2406
23
- euroeval/utils.py,sha256=CFjYMoKdcxLUEM-aF3pxf_3TnGWvGasjfb8pDMJVe9U,18772
23
+ euroeval/utils.py,sha256=bbq7WCcIrMKjBRaZ8EcnRpRAvL_F-tCxiL0We_po3QE,22397
24
24
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
25
25
  euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
26
26
  euroeval/benchmark_modules/fresh.py,sha256=k6bqDEnazRAX9ILVsRrzUTbkgNO4NcLCxHToCnLWV8M,9641
27
- euroeval/benchmark_modules/hf.py,sha256=Typig7WDqOn_uGE24s_P_9PHvq-V0MrKGD7xbh0aYnk,43244
28
- euroeval/benchmark_modules/litellm.py,sha256=ZJ9dB683pXPHDf70OOJfmHn_y706xRYzstYLz2ytCKE,39784
29
- euroeval/benchmark_modules/vllm.py,sha256=O8-dcVkU2jgZer44EOeTC8E4d-xQjPDOXnoyzXxAToQ,46179
27
+ euroeval/benchmark_modules/hf.py,sha256=VcgWZmSZc4B3FgeUGC0eWQIRv97luU22-KijaBfuqU0,43602
28
+ euroeval/benchmark_modules/litellm.py,sha256=pbTsq6Bb8cnFbdZMUSrUs-XlNAyaCIWNcEKKRIfprx8,45161
29
+ euroeval/benchmark_modules/vllm.py,sha256=7AZrvcwHevrQbXvbjTCp4S6HpM0Obk6CIQLbmUWIn9s,47483
30
30
  euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
31
31
  euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
32
32
  euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
33
- euroeval/task_utils/sequence_classification.py,sha256=832iWpPR3CsnlBIYA976eN21WUFQLUmIlDxFIvOsROk,10266
33
+ euroeval/task_utils/sequence_classification.py,sha256=JDZfiTj5RdwYwlhhTqVBj2mVdwmkoykZ6wJzEbWj0lo,12225
34
34
  euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
35
35
  euroeval/task_utils/token_classification.py,sha256=aW2GGk-dqa7lioIsHirVgD8AMrQEAnVasmjEWQ4xu7w,17778
36
- euroeval-15.4.2.dist-info/METADATA,sha256=cvpyWIKPXNKn1Idv7w3C7z8MBVljmw50jBdskL_32oI,10752
37
- euroeval-15.4.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
38
- euroeval-15.4.2.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
39
- euroeval-15.4.2.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
40
- euroeval-15.4.2.dist-info/RECORD,,
36
+ euroeval-15.5.0.dist-info/METADATA,sha256=T48YoPuFBEFI5sxgUadzkD3tidIB3TA1mKEKsFuh7fs,10752
37
+ euroeval-15.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
38
+ euroeval-15.5.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
39
+ euroeval-15.5.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
40
+ euroeval-15.5.0.dist-info/RECORD,,