EuroEval 16.2.1-py3-none-any.whl → 16.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (39)
  1. euroeval/__init__.py +4 -2
  2. euroeval/benchmark_modules/fresh.py +3 -1
  3. euroeval/benchmark_modules/hf.py +8 -4
  4. euroeval/benchmark_modules/litellm.py +5 -17
  5. euroeval/benchmark_modules/vllm.py +98 -30
  6. euroeval/benchmarker.py +291 -405
  7. euroeval/cli.py +1 -1
  8. euroeval/constants.py +3 -0
  9. euroeval/data_models.py +35 -35
  10. euroeval/dataset_configs/__init__.py +1 -0
  11. euroeval/dataset_configs/danish.py +0 -2
  12. euroeval/dataset_configs/dutch.py +0 -2
  13. euroeval/dataset_configs/english.py +0 -2
  14. euroeval/dataset_configs/finnish.py +0 -2
  15. euroeval/dataset_configs/french.py +0 -2
  16. euroeval/dataset_configs/german.py +0 -2
  17. euroeval/dataset_configs/italian.py +0 -2
  18. euroeval/dataset_configs/latvian.py +2 -3
  19. euroeval/dataset_configs/lithuanian.py +62 -0
  20. euroeval/dataset_configs/norwegian.py +0 -2
  21. euroeval/dataset_configs/polish.py +0 -2
  22. euroeval/dataset_configs/portuguese.py +0 -2
  23. euroeval/dataset_configs/spanish.py +0 -2
  24. euroeval/dataset_configs/swedish.py +0 -3
  25. euroeval/metrics/huggingface.py +1 -1
  26. euroeval/metrics/pipeline.py +5 -0
  27. euroeval/prompt_templates/linguistic_acceptability.py +9 -0
  28. euroeval/prompt_templates/multiple_choice.py +9 -0
  29. euroeval/prompt_templates/named_entity_recognition.py +20 -0
  30. euroeval/prompt_templates/reading_comprehension.py +10 -0
  31. euroeval/prompt_templates/sentiment_classification.py +11 -0
  32. euroeval/tokenisation_utils.py +8 -8
  33. euroeval/utils.py +10 -5
  34. {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/METADATA +181 -60
  35. euroeval-16.3.0.dist-info/RECORD +71 -0
  36. euroeval-16.2.1.dist-info/RECORD +0 -70
  37. {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/WHEEL +0 -0
  38. {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/entry_points.txt +0 -0
  39. {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py CHANGED
@@ -103,8 +103,10 @@ os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
  os.environ["VLLM_USE_V1"] = "1"


- # Use the FlashInfer flash-attention backend for vLLM
- os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
+ # Use the FlashInfer flash-attention backend for vLLM, unless the user has already
+ # specified a different backend.
+ if os.getenv("VLLM_ATTENTION_BACKEND") is None:
+     os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"


  # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
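
Note that a backend chosen by the user now takes precedence: EuroEval only falls back to FlashInfer when the environment variable is unset. A minimal sketch of the intended usage, where the FLASH_ATTN value is only an illustrative choice and not something the package requires:

import os

# Pick a different vLLM attention backend before importing euroeval; as of 16.3.0 the
# package leaves an existing value untouched and only defaults to FLASHINFER otherwise.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"  # illustrative value

import euroeval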
euroeval/benchmark_modules/fresh.py CHANGED
@@ -1,5 +1,6 @@
  """Freshly initialised encoder models."""

+ import re
  import typing as t
  from functools import cached_property
  from json import JSONDecodeError
@@ -45,6 +46,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
      """A freshly initialised encoder model."""

      fresh_model = True
+     allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}

      def __init__(
          self,
@@ -294,7 +296,7 @@ def load_model_and_tokeniser(
          token=get_hf_token(api_key=benchmark_config.api_key),
          add_prefix_space=prefix,
          cache_dir=model_config.model_cache_dir,
-         use_fast=True,
+         use_fast=False if model_config.param == "slow-tokenizer" else True,
          verbose=False,
          trust_remote_code=benchmark_config.trust_remote_code,
      )
euroeval/benchmark_modules/hf.py CHANGED
@@ -2,6 +2,7 @@

  import collections.abc as c
  import logging
+ import re
  import typing as t
  from functools import cached_property, partial
  from json import JSONDecodeError
@@ -93,6 +94,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
      fresh_model = False
      batching_preference = BatchingPreference.NO_PREFERENCE
      high_priority = True
+     allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}

      def __init__(
          self,
@@ -690,7 +692,7 @@ def load_model_and_tokeniser(
          model=model,
          model_id=model_id,
          trust_remote_code=benchmark_config.trust_remote_code,
-         model_cache_dir=model_config.model_cache_dir,
+         model_config=model_config,
      )

      return model, tokeniser
@@ -880,7 +882,7 @@ def load_tokeniser(
      model: "PreTrainedModel | None",
      model_id: str,
      trust_remote_code: bool,
-     model_cache_dir: str,
+     model_config: "ModelConfig",
  ) -> "PreTrainedTokenizer":
      """Load the tokeniser.

@@ -892,17 +894,19 @@
          The model identifier. Used for logging.
      trust_remote_code:
          Whether to trust remote code.
+     model_config:
+         The model configuration.

      Returns:
          The loaded tokeniser.
      """
      loading_kwargs: dict[str, bool | str] = dict(
-         use_fast=True,
+         use_fast=False if model_config.param == "slow-tokenizer" else True,
          verbose=False,
          trust_remote_code=trust_remote_code,
          padding_side="right",
          truncation_side="right",
-         cache_dir=model_cache_dir,
+         cache_dir=model_config.model_cache_dir,
      )

      # If the model is a subclass of a certain model types then we have to add a prefix
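
The use_fast toggle above is what the new slow-tokenizer model parameter controls. As a minimal sketch of the underlying transformers behaviour (the model id below is purely illustrative and not part of EuroEval):

from transformers import AutoTokenizer

model_id = "bert-base-cased"  # illustrative model

# use_fast=True (the previous hard-coded value) prefers the Rust-backed "fast"
# tokeniser when one exists; use_fast=False forces the pure-Python "slow"
# implementation, which is what the `slow-tokenizer` parameter now selects.
fast_tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
slow_tok = AutoTokenizer.from_pretrained(model_id, use_fast=False)

print(type(fast_tok).__name__)  # BertTokenizerFast
print(type(slow_tok).__name__)  # BertTokenizer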
euroeval/benchmark_modules/litellm.py CHANGED
@@ -369,7 +369,8 @@ class LiteLLMModel(BenchmarkModule):
      ]
      logger.debug(
          f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
-         f"{len(inputs_to_run):,} failed message(s)"
+         f"{len(inputs_to_run):,} failed message(s). Here is the first error: "
+         f"{failures[0][1]}."
      )

      # Attempt to handle the exceptions, to improve the chance of getting
@@ -453,8 +454,7 @@ class LiteLLMModel(BenchmarkModule):
      requires_thinking_disabled_messages = ["thinking.type: Field required"]
      seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
      response_format_messages = [
-         "got an unexpected keyword argument 'response_format'",
-         "The model outputs empty dictionaries.",
+         "got an unexpected keyword argument 'response_format'"
      ]

      if any(msg.lower() in error_msg for msg in stop_messages):
@@ -713,18 +713,6 @@ class LiteLLMModel(BenchmarkModule):
      ]
      responses = await tqdm_async.gather(*requests, leave=False)

-     # If we are performing structured generation and the model just outputs an empty
-     # dictionary, then we convert those to exceptions, to disable structured
-     # generation
-     if "response_format" in generation_kwargs:
-         responses = [
-             RuntimeError("The model outputs empty dictionaries.")
-             if not isinstance(response, Exception)
-             and any(choice.message.content == "{}" for choice in response.choices)
-             else response
-             for response in responses
-         ]
-
      # Separate the successful responses from the failed ones
      successes = [
          (idx, response)
@@ -984,7 +972,7 @@ class LiteLLMModel(BenchmarkModule):
          model=None,
          model_id=model_id,
          trust_remote_code=self.benchmark_config.trust_remote_code,
-         model_cache_dir=self.model_config.model_cache_dir,
+         model_config=self.model_config,
      )

      if (
@@ -1067,7 +1055,7 @@ class LiteLLMModel(BenchmarkModule):
          model=None,
          model_id=model_id,
          trust_remote_code=self.benchmark_config.trust_remote_code,
-         model_cache_dir=self.model_config.model_cache_dir,
+         model_config=self.model_config,
      )

      all_max_lengths: list[int] = list()
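
The retry loop above now logs the first captured failure instead of only the count. A rough, self-contained sketch of that logging pattern with made-up placeholder failures (the (index, exception) pairs mirror the structure used in the diff, but the values are invented):

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("euroeval-sketch")

attempt, num_attempts = 0, 3
failures = [(4, RuntimeError("rate limit exceeded")), (9, TimeoutError("request timed out"))]
inputs_to_run = [idx for idx, _ in failures]

logger.debug(
    f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
    f"{len(inputs_to_run):,} failed message(s). Here is the first error: "
    f"{failures[0][1]}."
)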
euroeval/benchmark_modules/vllm.py CHANGED
@@ -104,7 +104,7 @@ class VLLMModel(HuggingFaceEncoderModel):
      fresh_model = False
      batching_preference = BatchingPreference.ALL_AT_ONCE
      high_priority = True
-     allowed_params = {re.compile(r".*"): ["thinking", "no-thinking"]}
+     allowed_params = {re.compile(r".*"): ["thinking", "no-thinking", "slow-tokenizer"]}

      def __init__(
          self,
@@ -559,11 +559,34 @@ class VLLMModel(HuggingFaceEncoderModel):
              torch.LongTensor(completion_id) for completion_id in completion_ids
          ]
      )
-     if self.end_of_reasoning_token is not None:
-         completions = [
-             completion.split(self.end_of_reasoning_token)[-1]
-             for completion in completions
-         ]
+     if (
+         self.end_of_reasoning_token is not None
+         and self.generative_type == GenerativeType.REASONING
+     ):
+         for idx in range(len(completions)):
+             if self.end_of_reasoning_token in completions[idx]:
+                 completions[idx] = completions[idx].split(
+                     self.end_of_reasoning_token
+                 )[-1]
+             elif self.benchmark_config.verbose:
+                 logger.warning(
+                     f"The model {self.model_config.model_id!r} is a reasoning "
+                     "model, but the generated output does not contain the end of "
+                     f"reasoning token ({self.end_of_reasoning_token!r}). Using "
+                     "an empty string as the prediction instead."
+                 )
+                 completions[idx] = ""
+             else:
+                 log_once(
+                     f"The model {self.model_config.model_id!r} is a reasoning "
+                     "model, but the generated output does not contain the end of "
+                     f"reasoning token ({self.end_of_reasoning_token!r}). Using "
+                     "an empty string as the prediction instead. Only showing "
+                     "this warning once - see all occurrences if you run with the "
+                     "`verbose` flag.",
+                     level=logging.WARNING,
+                 )
+                 completions[idx] = ""
      stop_token_pattern = re.compile(
          "|".join(re.escape(stop_token) for stop_token in stop_tokens)
      )
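
The reworked block above only strips reasoning traces for models whose generative type is REASONING, and falls back to an empty prediction when the end-of-reasoning token never appears in the output. A standalone sketch of that behaviour, using </think> as an example token (EuroEval derives the actual token from the model, so the value here is only illustrative):

def strip_reasoning(completion: str, end_of_reasoning_token: str = "</think>") -> str:
    """Keep only the text after the reasoning trace, or nothing if the trace never ends."""
    if end_of_reasoning_token in completion:
        return completion.split(end_of_reasoning_token)[-1]
    # Mirrors the new fallback: an unterminated reasoning trace yields an empty prediction.
    return ""

print(strip_reasoning("<think>chain of thought</think> positive"))  # ' positive'
print(strip_reasoning("<think>still reasoning when generation stopped"))  # ''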
@@ -830,21 +853,27 @@ def load_model_and_tokeniser(
          adapter_base_model_id=model_config.adapter_base_model_id,
          trust_remote_code=benchmark_config.trust_remote_code,
          model_max_length=true_max_model_len,
-         model_cache_dir=model_config.model_cache_dir,
+         model_config=model_config,
          token=get_hf_token(api_key=benchmark_config.api_key),
      )
+     vllm_tokenisation_params = get_vllm_tokenisation_params(
+         tokeniser=tokeniser, model_config=model_config
+     )

      clear_vllm()

-     # if we do not have an internet connection we need to give the path to the folder
-     # that contains the model weights and config files, otherwise vLLM will try to
-     # download them regardless if they are already present in the download_dir
-     model_path = resolve_model_path(download_dir)
-
      try:
          model = LLM(
-             model=model_id if internet_connection_available() else model_path,
-             tokenizer=model_id if internet_connection_available() else model_path,
+             model=(
+                 model_id
+                 if internet_connection_available()
+                 else resolve_model_path(download_dir=download_dir)
+             ),
+             tokenizer=(
+                 model_id
+                 if internet_connection_available()
+                 else resolve_model_path(download_dir=download_dir)
+             ),
              gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
              max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
              download_dir=download_dir,
@@ -862,16 +891,7 @@
              enable_prefix_caching=False,
              enable_lora=model_config.adapter_base_model_id is not None,
              max_lora_rank=256,
-             # Special arguments in case we are dealing with a Mistral model
-             tokenizer_mode="mistral"
-             if isinstance(tokeniser, MistralCommonTokenizer)
-             else "auto",
-             config_format="mistral"
-             if isinstance(tokeniser, MistralCommonTokenizer)
-             else "auto",
-             load_format="mistral"
-             if isinstance(tokeniser, MistralCommonTokenizer)
-             else "auto",
+             **vllm_tokenisation_params,
          )
      except (RuntimeError, ValueError, OSError) as e:
          if "awaiting a review from the repo authors" in str(e):
@@ -900,7 +920,7 @@ def load_tokeniser(
      adapter_base_model_id: str | None,
      trust_remote_code: bool,
      model_max_length: int,
-     model_cache_dir: str,
+     model_config: "ModelConfig",
      token: str | bool,
  ) -> "PreTrainedTokenizer":
      """Load the tokeniser.
@@ -917,8 +937,8 @@
          Whether to trust remote code.
      model_max_length:
          The maximum length of the model.
-     model_cache_dir:
-         The cache directory for the model.
+     model_config:
+         The model configuration.
      token:
          The Hugging Face API token.

@@ -929,7 +949,7 @@
      config = AutoConfig.from_pretrained(
          adapter_base_model_id or model_id,
          revision=revision,
-         cache_dir=model_cache_dir,
+         cache_dir=model_config.model_cache_dir,
          token=token,
          trust_remote_code=trust_remote_code,
          local_files_only=not internet_connection_available(),
@@ -937,15 +957,25 @@
      num_retries = 5
      for _ in range(num_retries):
          try:
+             # Mistral instruction-tuned models need a custom tokeniser
+             if model_id.startswith("mistralai/") and "base" not in model_id.lower():
+                 tokeniser = MistralCommonTokenizer.from_pretrained(
+                     model_id,
+                     padding_side="left",
+                     truncation_side="left",
+                     model_max_length=model_max_length,
+                     token=token,
+                 )
+                 break
              tokeniser = AutoTokenizer.from_pretrained(
                  model_id,
-                 use_fast=True,
+                 use_fast=False if model_config.param == "slow-tokenizer" else True,
                  verbose=False,
                  trust_remote_code=trust_remote_code,
                  padding_side="left",
                  truncation_side="left",
                  model_max_length=model_max_length,
-                 cache_dir=model_cache_dir,
+                 cache_dir=model_config.model_cache_dir,
                  config=config,
                  token=token,
                  local_files_only=not internet_connection_available(),
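
The Mistral branch above is selected purely from the model id. A tiny sketch of that predicate (the helper name is made up for illustration; the condition itself is copied from the diff, and the second model id is hypothetical):

def needs_mistral_tokeniser(model_id: str) -> bool:
    # Instruction-tuned mistralai models get MistralCommonTokenizer, while ids
    # containing "base" keep the regular AutoTokenizer path.
    return model_id.startswith("mistralai/") and "base" not in model_id.lower()

print(needs_mistral_tokeniser("mistralai/Mistral-7B-Instruct-v0.3"))  # True
print(needs_mistral_tokeniser("mistralai/Mistral-7B-Base"))  # False (hypothetical id)
print(needs_mistral_tokeniser("google/gemma-2-9b-it"))  # False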
@@ -1186,3 +1216,41 @@ def get_pbar_without_leave(*tqdm_args, **tqdm_kwargs) -> tqdm:
      """
      tqdm_kwargs.pop("leave", None) # Remove the 'leave' key if it exists
      return tqdm(*tqdm_args, leave=False, **tqdm_kwargs)
+
+
+ def get_vllm_tokenisation_params(
+     tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
+ ) -> dict[str, t.Any]:
+     """Get the tokenisation parameters for vLLM.
+
+     Args:
+         tokeniser:
+             The tokeniser.
+         model_config:
+             The model configuration.
+
+     Returns:
+         A dictionary of tokenisation parameters to pass to vLLM.
+     """
+     if isinstance(tokeniser, MistralCommonTokenizer):
+         tokeniser_mode = "mistral"
+     elif model_config.param == "slow-tokenizer":
+         tokeniser_mode = "slow"
+     else:
+         tokeniser_mode = "auto"
+
+     if isinstance(tokeniser, MistralCommonTokenizer):
+         config_format = "mistral"
+     else:
+         config_format = "auto"
+
+     if isinstance(tokeniser, MistralCommonTokenizer):
+         load_format = "mistral"
+     else:
+         load_format = "auto"
+
+     return dict(
+         tokenizer_mode=tokeniser_mode,
+         config_format=config_format,
+         load_format=load_format,
+     )
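
The dictionary returned by get_vllm_tokenisation_params is splatted into the LLM(...) call shown earlier. A rough usage sketch with the possible return values written out by hand (the model id is illustrative, and actually constructing an LLM requires a GPU):

from vllm import LLM

# Shapes of the dictionaries the helper can return, per the branches above.
mistral_params = {"tokenizer_mode": "mistral", "config_format": "mistral", "load_format": "mistral"}
slow_params = {"tokenizer_mode": "slow", "config_format": "auto", "load_format": "auto"}
default_params = {"tokenizer_mode": "auto", "config_format": "auto", "load_format": "auto"}

# The chosen dictionary is passed straight through to the vLLM constructor.
model = LLM(model="mistralai/Mistral-7B-Instruct-v0.3", **mistral_params)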