EuroEval: euroeval-15.5.0-py3-none-any.whl → euroeval-15.6.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/benchmark_modules/base.py +3 -2
- euroeval/benchmark_modules/fresh.py +8 -6
- euroeval/benchmark_modules/hf.py +33 -31
- euroeval/benchmark_modules/litellm.py +120 -56
- euroeval/benchmark_modules/vllm.py +41 -26
- euroeval/benchmarker.py +23 -21
- euroeval/callbacks.py +2 -2
- euroeval/constants.py +1 -1
- euroeval/data_models.py +261 -42
- euroeval/dataset_configs/__init__.py +61 -0
- euroeval/dataset_configs/danish.py +120 -0
- euroeval/dataset_configs/dutch.py +123 -0
- euroeval/dataset_configs/english.py +88 -0
- euroeval/dataset_configs/faroese.py +54 -0
- euroeval/dataset_configs/french.py +83 -0
- euroeval/dataset_configs/german.py +91 -0
- euroeval/dataset_configs/icelandic.py +148 -0
- euroeval/dataset_configs/italian.py +81 -0
- euroeval/dataset_configs/norwegian.py +178 -0
- euroeval/dataset_configs/spanish.py +78 -0
- euroeval/dataset_configs/swedish.py +100 -0
- euroeval/exceptions.py +10 -10
- euroeval/finetuning.py +6 -10
- euroeval/generation.py +1 -0
- euroeval/human_evaluation.py +2 -2
- euroeval/languages.py +20 -13
- euroeval/model_cache.py +1 -1
- euroeval/model_loading.py +1 -12
- euroeval/prompt_templates/__init__.py +8 -0
- euroeval/prompt_templates/linguistic_acceptability.py +112 -0
- euroeval/prompt_templates/multiple_choice.py +97 -0
- euroeval/prompt_templates/named_entity_recognition.py +257 -0
- euroeval/prompt_templates/reading_comprehension.py +118 -0
- euroeval/prompt_templates/sentiment_classification.py +137 -0
- euroeval/prompt_templates/summarization.py +97 -0
- euroeval/speed_benchmark.py +1 -1
- euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
- euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
- euroeval/{task_utils → task_group_utils}/sequence_classification.py +1 -1
- euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
- euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
- euroeval/tasks.py +54 -0
- euroeval/tokenization_utils.py +343 -0
- euroeval/types.py +3 -1
- euroeval/utils.py +2 -347
- {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/METADATA +31 -9
- euroeval-15.6.1.dist-info/RECORD +59 -0
- euroeval/dataset_configs.py +0 -2408
- euroeval-15.5.0.dist-info/RECORD +0 -40
- /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
- {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/WHEEL +0 -0
- {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/entry_points.txt +0 -0
- {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/licenses/LICENSE +0 -0

euroeval/benchmark_modules/base.py CHANGED
@@ -10,7 +10,8 @@ from functools import cached_property, partial
 from datasets import DatasetDict
 from torch import nn
 from tqdm.auto import tqdm
-from transformers import PreTrainedTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.trainer import Trainer

 from ..data_models import (
     BenchmarkConfig,
@@ -21,7 +22,7 @@ from ..data_models import (
 )
 from ..enums import BatchingPreference, GenerativeType, TaskGroup
 from ..exceptions import NeedsEnvironmentVariable, NeedsExtraInstalled
-from ..task_utils import (
+from ..task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,

euroeval/benchmark_modules/fresh.py CHANGED
@@ -4,19 +4,21 @@ import os
 from functools import cached_property
 from json import JSONDecodeError

-from transformers import (
-    AutoConfig,
-    AutoTokenizer,
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.auto.configuration_auto import AutoConfig
+from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers.models.electra import (
     ElectraForQuestionAnswering,
     ElectraForSequenceClassification,
     ElectraForTokenClassification,
-    PretrainedConfig,
-    PreTrainedModel,
-    PreTrainedTokenizer,
+)
+from transformers.models.xlm_roberta import (
     XLMRobertaForQuestionAnswering,
     XLMRobertaForSequenceClassification,
     XLMRobertaForTokenClassification,
 )
+from transformers.tokenization_utils import PreTrainedTokenizer

 from ..data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 from ..enums import InferenceBackend, ModelType, TaskGroup
euroeval/benchmark_modules/hf.py CHANGED
@@ -13,31 +13,29 @@ import torch
 from datasets import DatasetDict
 from huggingface_hub import HfApi
 from huggingface_hub import whoami as hf_whoami
-from huggingface_hub.
-from huggingface_hub.hf_api import RepositoryNotFoundError, RevisionNotFoundError
-from huggingface_hub.utils import (
+from huggingface_hub.errors import (
     GatedRepoError,
     HFValidationError,
     LocalTokenNotFoundError,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
 )
+from huggingface_hub.hf_api import ModelInfo as HfApiModelInfo
 from peft import PeftConfig
 from requests.exceptions import RequestException
 from torch import nn
-from transformers import (
-    AutoConfig,
-    AutoTokenizer,
-    BatchEncoding,
+from transformers.configuration_utils import PretrainedConfig
+from transformers.data.data_collator import (
     DataCollatorForTokenClassification,
     DataCollatorWithPadding,
-    PretrainedConfig,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    Trainer,
 )
 from transformers.modelcard import TASK_MAPPING
-from transformers.models.auto.modeling_auto import (
-    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
-)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.auto.configuration_auto import AutoConfig
+from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import BatchEncoding
+from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError

 from ..constants import (
@@ -65,18 +63,17 @@ from ..exceptions import (
     NoInternetConnection,
 )
 from ..languages import get_all_languages
-from ..task_utils import (
+from ..task_group_utils import (
     multiple_choice_classification,
     question_answering,
     token_classification,
 )
+from ..tokenization_utils import get_bos_token, get_eos_token
 from ..types import ExtractLabelsFunction
 from ..utils import (
     block_terminal_output,
     create_model_cache_dir,
-    get_bos_token,
     get_class_by_name,
-    get_eos_token,
     internet_connection_available,
     log_once,
 )
@@ -690,7 +687,7 @@ def load_model_and_tokenizer(
     assert model is not None, "The model should not be None."

     model.eval()
-    model.to(benchmark_config.device)
+    model.to(benchmark_config.device)  # type: ignore[arg-type]

     if (
         isinstance(model, PreTrainedModel)
@@ -797,12 +794,6 @@ def get_model_repo_info(
         tags += base_model_info.tags or list()
     tags = list(set(tags))

-    # TEMP: This extends the `TASK_MAPPING` dictionary to include the missing
-    # 'image-text-to-text' pipeline tag. This will be added as part of `TASK_MAPPING`
-    # when this PR has been merged in and published:
-    # https://github.com/huggingface/transformers/pull/37107
-    TASK_MAPPING["image-text-to-text"] = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
-
     # Get the pipeline tag for the model. If it is not specified, then we determine it
     # by checking the model's architecture as written in the model's Hugging Face config
     pipeline_tag = model_info.pipeline_tag
@@ -824,7 +815,7 @@ def get_model_repo_info(
         generative_class_names = [
             class_name
             for tag in GENERATIVE_PIPELINE_TAGS
-            for class_name in TASK_MAPPING.get(tag, dict()).values()
+            for class_name in TASK_MAPPING.get(tag, dict()).values()  # type: ignore[attr-defined]
         ]
         if class_names is not None and any(
             class_name in generative_class_names for class_name in class_names
@@ -1083,17 +1074,20 @@ def setup_model_for_question_answering(model: "PreTrainedModel") -> "PreTrainedModel":
         for attribute in attribute_list:
             token_type_embeddings = getattr(token_type_embeddings, attribute)

+        token_type_embedding_tensor = token_type_embeddings.weight.data
+        assert isinstance(token_type_embedding_tensor, torch.Tensor)
+
         # If the token type embeddings has shape (1, ...) then set the shape to
         # (2, ...) by randomly initializing the second token type embedding
-        if token_type_embeddings.weight.data.shape[0] == 1:
+        if token_type_embedding_tensor.shape[0] == 1:
             token_type_embeddings.weight.data = torch.cat(
                 (
-                    token_type_embeddings.weight.data,
-                    torch.rand_like(token_type_embeddings.weight.data),
+                    token_type_embedding_tensor,
+                    torch.rand_like(token_type_embedding_tensor),
                 ),
                 dim=0,
             )
-            token_type_embeddings.num_embeddings = 2
+            token_type_embeddings.num_embeddings = 2  # type: ignore[assignment]

     # Set the model config to use the new type vocab size
     model.config.type_vocab_size = 2
@@ -1160,7 +1154,7 @@ def align_model_and_tokenizer(
     # Move the model to the CPU, since otherwise we can't catch the IndexErrors when
     # finding the maximum sequence length of the model
     model_device = model.device
-    model.to(torch.device("cpu"))
+    model.to(torch.device("cpu"))  # type: ignore[arg-type]

     # Manually check that this model max length is valid for the model, and adjust
     # otherwise
@@ -1182,8 +1176,16 @@ def align_model_and_tokenizer(
         except IndexError:
             continue

+        except ValueError as e:
+            # This happens when the model is using Triton, such as with ModernBERT,
+            # which doesn't work with CPU tensors at all
+            if "cpu tensor" in str(e):
+                break
+            else:
+                raise e
+
     # Move the model back to the original device
-    model.to(model_device)
+    model.to(model_device)  # type: ignore[arg-type]

     # If there is a mismatch between the vocab size according to the tokenizer and
     # the vocab size according to the model, we raise an error
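The `align_model_and_tokenizer` hunk above tweaks a probing loop that finds a model's usable maximum sequence length on CPU. Below is a minimal, self-contained sketch of that pattern, not EuroEval's actual implementation; `model_forward` is a hypothetical stand-in for running the real model on CPU:

    from typing import Callable

    def probe_max_length(
        model_forward: Callable[[list[int]], None], candidate_lengths: list[int]
    ) -> int:
        """Return the largest candidate length the forward pass accepts."""
        max_supported = 0
        for length in sorted(candidate_lengths):
            dummy_input = [0] * length  # hypothetical token IDs
            try:
                model_forward(dummy_input)
            except IndexError:
                # Position embeddings ran out, so this length is unsupported
                continue
            except ValueError as exc:
                # Triton-backed models (e.g. ModernBERT) reject CPU tensors outright,
                # so stop probing instead of failing the whole run
                if "cpu tensor" in str(exc):
                    break
                raise
            max_supported = max(max_supported, length)
        return max_supported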
euroeval/benchmark_modules/litellm.py CHANGED
@@ -32,10 +32,10 @@ from litellm.exceptions import (
     Timeout,
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
-from litellm.types.utils import ModelResponse
+from litellm.types.utils import ChoiceLogprobs, ModelResponse
 from requests.exceptions import RequestException
 from tqdm.auto import tqdm
-from transformers import Trainer
+from transformers.trainer import Trainer

 from ..constants import MAX_LOGPROBS, REASONING_MAX_TOKENS, TASKS_USING_JSON
 from ..data_models import (
@@ -59,14 +59,15 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
-from ..task_utils import (
+from ..task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,
     token_classification,
 )
+from ..tokenization_utils import get_first_label_token_mapping
 from ..types import ExtractLabelsFunction
-from ..utils import create_model_cache_dir, get_first_label_token_mapping, log_once
+from ..utils import create_model_cache_dir, log_once
 from .base import BenchmarkModule
 from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer

@@ -316,7 +317,7 @@ class LiteLLMModel(BenchmarkModule):
         elif isinstance(e, RateLimitError):
             raise InvalidModel(
                 "You have encountered your rate limit for model "
-                f"{self.model_config.model_id!r}.
+                f"{self.model_config.model_id!r}. Skipping."
             )
         else:
             raise InvalidBenchmark(
@@ -366,14 +367,22 @@ class LiteLLMModel(BenchmarkModule):
         # Structure the model output as a GenerativeModelOutput object
         model_output = GenerativeModelOutput(sequences=[generation_output])
         if hasattr(model_response_choices, "logprobs"):
-
-
-
-
+            logprobs_obj = model_response_choices.logprobs
+            if isinstance(logprobs_obj, ChoiceLogprobs):
+                logprobs_list: list[list[tuple[str, float]]] = [
+                    [
+                        (top_logprob.token, top_logprob.logprob)
+                        for top_logprob in content.top_logprobs
+                    ]
+                    for content in model_response_choices.logprobs.content or list()
                 ]
-
-
-
+                model_output.scores = [logprobs_list]
+            else:
+                log_once(
+                    "The logprobs object is malformed, so we won't use logprobs to "
+                    "determine the labels.",
+                    level=logging.WARNING,
+                )

         return model_output

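The logprobs handling above converts litellm's `ChoiceLogprobs` structure into one list of `(token, logprob)` candidates per generated position. Here is a standalone sketch of that conversion, using plain dataclasses as stand-ins for litellm's types (the real ones live in `litellm.types.utils`):

    from dataclasses import dataclass, field

    @dataclass
    class TopLogprob:
        token: str
        logprob: float

    @dataclass
    class LogprobContent:
        top_logprobs: list[TopLogprob] = field(default_factory=list)

    def to_score_lists(
        contents: list[LogprobContent],
    ) -> list[list[tuple[str, float]]]:
        """One list of (token, logprob) candidates per generated position."""
        return [
            [(top.token, top.logprob) for top in content.top_logprobs]
            for content in contents
        ]

    # Example: two generated positions with two candidate tokens each
    scores = to_score_lists(
        [
            LogprobContent([TopLogprob("positive", -0.1), TopLogprob("negative", -2.3)]),
            LogprobContent([TopLogprob(".", -0.05), TopLogprob("!", -3.1)]),
        ]
    )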
@@ -403,7 +412,7 @@ class LiteLLMModel(BenchmarkModule):
         # get the number of parameters from the Hugging Face model configuration from
         # the Hugging Face Hub
         if self.model_config.model_id.startswith("huggingface/"):
-            model_id = self.model_config.model_id.split(sep="/"
+            model_id = "/".join(self.model_config.model_id.split(sep="/")[-2:])
             if HuggingFaceEncoderModel.model_exists(
                 model_id=model_id, benchmark_config=self.benchmark_config
             ):
@@ -467,7 +476,7 @@ class LiteLLMModel(BenchmarkModule):
         # get the vocabulary size from the Hugging Face model configuration from the
         # Hugging Face Hub
         if self.model_config.model_id.startswith("huggingface/"):
-            model_id = self.model_config.model_id.split(sep="/"
+            model_id = "/".join(self.model_config.model_id.split(sep="/")[-2:])
             if HuggingFaceEncoderModel.model_exists(
                 model_id=model_id, benchmark_config=self.benchmark_config
             ):
@@ -547,7 +556,7 @@ class LiteLLMModel(BenchmarkModule):
         # get the maximum length from the Hugging Face model configuration from the
         # Hugging Face Hub
         if self.model_config.model_id.startswith("huggingface/"):
-            model_id = self.model_config.model_id.split(sep="/"
+            model_id = "/".join(self.model_config.model_id.split(sep="/")[-2:])
             if HuggingFaceEncoderModel.model_exists(
                 model_id=model_id, benchmark_config=self.benchmark_config
             ):
@@ -688,42 +697,11 @@ class LiteLLMModel(BenchmarkModule):
         if model_id in litellm.model_list:
             return True

-        #
+        # Separate check for Ollama models
         if model_id.startswith("ollama/") or model_id.startswith("ollama_chat/"):
-            ollama_model_id = "/".join(model_id.split("/")[1:])
-            downloaded_ollama_models: list[str] = [
-                model_obj.model
-                for model_obj in ollama.list().models
-                if model_obj.model is not None
-            ]
-            if ollama_model_id not in downloaded_ollama_models:
-                try:
-                    response = ollama.pull(model=ollama_model_id, stream=True)
-                    with tqdm(
-                        desc=f"Downloading {ollama_model_id}",
-                        unit_scale=True,
-                        unit="B",
-                        leave=False,
-                    ) as pbar:
-                        for status in response:
-                            if status.total is not None:
-                                pbar.total = status.total
-                            if status.completed is not None:
-                                pbar.update(status.completed - pbar.n)
-                except ollama.ResponseError as e:
-                    if "file does not exist" in str(e).lower():
-                        return False
-                    else:
-                        raise InvalidModel(
-                            f"Failed to download Ollama model {ollama_model_id}. The "
-                            f"error message was: {e}"
-                        )
-            else:
-                log_once(
-                    f"Ollama model {ollama_model_id!r} already downloaded, so skipping "
-                    "download.",
-                    level=logging.DEBUG,
-                )
+            ollama_model_exists = try_download_ollama_model(model_id=model_id)
+            if ollama_model_exists:
+                return ollama_model_exists

         num_attempts = 10
         for _ in range(num_attempts):
@@ -737,6 +715,10 @@ class LiteLLMModel(BenchmarkModule):
                     api_version=benchmark_config.api_version,
                 )
                 return True
+            # A rate limit indicates that the model *does* exist, but we are being rate
+            # limited.
+            except RateLimitError:
+                return True
             except (
                 APIConnectionError,
                 Timeout,
@@ -748,12 +730,6 @@ class LiteLLMModel(BenchmarkModule):
                     "Retrying in 10 seconds..."
                 )
                 sleep(5)
-            except RateLimitError:
-                logger.warning(
-                    f"Rate limit exceeded for model {model_id!r}. Retrying in 10 "
-                    "seconds..."
-                )
-                sleep(10)
             except APIError as e:
                 if "'503 Service Unavailable" not in str(e):
                     raise e
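The two hunks above change how `RateLimitError` is treated in the model-existence check: being rate limited now counts as evidence that the model exists, while connection errors and timeouts still trigger a retry. A simplified sketch of that control flow, with hypothetical exception classes and a hypothetical `probe` callable standing in for the litellm call:

    import time

    class RateLimited(Exception):
        pass

    class TransientError(Exception):
        pass

    def model_exists(probe, num_attempts: int = 10, wait_seconds: float = 5.0) -> bool:
        for _ in range(num_attempts):
            try:
                probe()
                return True
            except RateLimited:
                # A rate limit means the endpoint is real, so the model exists
                return True
            except TransientError:
                time.sleep(wait_seconds)
        return False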
@@ -1155,3 +1131,91 @@ def raise_if_wrong_params(
         msg += " No parameters are allowed."
         raise InvalidModel(msg)
     return
+
+
+def try_download_ollama_model(model_id: str) -> bool:
+    """Try to download an Ollama model.
+
+    Args:
+        model_id:
+            The model ID. If the model does not start with "ollama/" or "ollama_chat/"
+            then this function will return False.
+
+    Returns:
+        Whether the model was downloaded successfully.
+    """
+    if not (model_id.startswith("ollama/") or model_id.startswith("ollama_chat/")):
+        return False
+
+    if model_id.startswith("ollama/"):
+        log_once(
+            "You're trying to benchmark a model with the old 'ollama/' prefix, which "
+            "probably results in bad performance, as it doesn't use the model's chat "
+            "template. If the model is not a chat model then just disregard this "
+            "warning, but if it is a chat model then please cancel this run and "
+            "use the 'ollama_chat/' prefix instead.",
+            level=logging.WARNING,
+        )
+
+    downloaded_ollama_models: list[str] = [
+        model_obj.model
+        for model_obj in ollama.list().models
+        if model_obj.model is not None
+    ]
+
+    ollama_model_id = "/".join(model_id.split("/")[1:])
+    if ollama_model_id not in downloaded_ollama_models:
+        # Try fetching the model info
+        try:
+            response = ollama.pull(model=ollama_model_id, stream=True)
+        except ollama.ResponseError as e:
+            if "file does not exist" in str(e).lower():
+                # Check if the model exists if we prepend "hf.co/"
+                try:
+                    ollama_model_id_with_prefix = f"hf.co/{ollama_model_id}"
+                    model_id_with_prefix = (
+                        f"{model_id.split('/')[0]}/{ollama_model_id_with_prefix}"
+                    )
+                    ollama.pull(model=ollama_model_id_with_prefix, stream=True)
+                    log_once(
+                        f"The model {model_id!r} cannot be found on Ollama, but the "
+                        f"model {model_id_with_prefix} *was* found, so we would "
+                        "recommend you cancelling this run and trying the evaluation "
+                        "with that model ID instead."
+                    )
+                    return False
+                except ollama.ResponseError as inner_e:
+                    if "file does not exist" in str(inner_e).lower():
+                        return False
+                    else:
+                        raise InvalidModel(
+                            f"Failed to download Ollama model {ollama_model_id}. "
+                            f"The error message was: {inner_e}"
+                        )
+            else:
+                raise InvalidModel(
+                    f"Failed to download Ollama model {ollama_model_id}. "
+                    f"The error message was: {e}"
+                )
+
+        # Download the model
+        with tqdm(
+            desc=f"Downloading {ollama_model_id}",
+            unit_scale=True,
+            unit="B",
+            leave=False,
+        ) as pbar:
+            for status in response:
+                if status.total is not None:
+                    pbar.total = status.total
+                if status.completed is not None:
+                    pbar.update(status.completed - pbar.n)
+        return True
+
+    else:
+        log_once(
+            f"Ollama model {ollama_model_id!r} already downloaded, so skipping "
+            "download.",
+            level=logging.DEBUG,
+        )
+        return True
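A hedged usage sketch of the new `try_download_ollama_model` helper defined above: it returns False for IDs without an "ollama/" or "ollama_chat/" prefix, pulls the model via the `ollama` client when it is not yet downloaded, and returns True once the model is available locally. This assumes the litellm extra is installed and an Ollama server is running; the model IDs below are only examples:

    from euroeval.benchmark_modules.litellm import try_download_ollama_model

    for candidate in ["ollama_chat/llama3.2", "gpt-4o-mini"]:
        if try_download_ollama_model(model_id=candidate):
            print(f"{candidate} is available via Ollama")
        else:
            print(f"{candidate} is not an Ollama model (or could not be fetched)")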
euroeval/benchmark_modules/vllm.py CHANGED
@@ -1,6 +1,7 @@
 """Generative models using the vLLM inference framework."""

 import collections.abc as c
+import contextlib
 import importlib.util
 import itertools as it
 import json
@@ -20,7 +21,10 @@ from datasets import DatasetDict
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
 from tqdm.auto import tqdm
-from transformers import AutoConfig
+from transformers.models.auto.configuration_auto import AutoConfig
+from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError

 from ..constants import (
@@ -53,40 +57,39 @@ from ..exceptions import (
     NeedsExtraInstalled,
 )
 from ..languages import get_all_languages
-from ..task_utils import (
+from ..task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,
     token_classification,
 )
-from ..types import ExtractLabelsFunction
-from ..utils import (
-    clear_memory,
-    create_model_cache_dir,
+from ..tokenization_utils import (
     get_bos_token,
     get_end_of_chat_token_ids,
     get_eos_token,
     get_first_label_token_mapping,
+    should_prompts_be_stripped,
+)
+from ..types import ExtractLabelsFunction
+from ..utils import (
+    clear_memory,
+    create_model_cache_dir,
     get_min_cuda_compute_capability,
     log_once,
-    should_prompts_be_stripped,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config

 if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
     from vllm import LLM, RequestOutput, SamplingParams
+    from vllm.distributed.parallel_state import (
+        destroy_distributed_environment,
+        destroy_model_parallel,
+    )
     from vllm.lora.request import LoRARequest

-    try:
-        from vllm.model_executor.parallel_utils.parallel_state import (
-            destroy_model_parallel,
-        )
-    except ImportError:
-        from vllm.distributed.parallel_state import destroy_model_parallel
-
 if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
     from outlines.models.vllm import adapt_tokenizer
-    from outlines.processors import JSONLogitsProcessor
+    from outlines.processors.structured import JSONLogitsProcessor

 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
     import ray
@@ -156,6 +159,14 @@ class VLLMModel(HuggingFaceEncoderModel):
             lora_name="adapter", lora_int_id=1, lora_path=adapter_path
         )

+    def __del__(self) -> None:
+        """Clean up the model and tokenizer."""
+        clear_vllm()
+        if hasattr(self, "_model"):
+            del self._model
+        if hasattr(self, "_tokenizer"):
+            del self._tokenizer
+
     @property
     def generative_type(self) -> GenerativeType | None:
         """Get the generative type of the model.
@@ -330,7 +341,7 @@ class VLLMModel(HuggingFaceEncoderModel):
             pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
             logits_processor = JSONLogitsProcessor(
                 schema=pydantic_class,
-                tokenizer=adapt_tokenizer(tokenizer=self._tokenizer),  #
+                tokenizer=adapt_tokenizer(tokenizer=self._tokenizer),  # type: ignore
                 whitespace_pattern=r" ?",
             )
             log_once(
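The JSON-constrained decoding above builds its `AnswerFormat` schema dynamically with pydantic's `create_model` and `conlist` (both already imported in this module) and hands it to outlines' `JSONLogitsProcessor`. A small sketch of the schema-building part only, assuming pydantic v2; the field names and the `max_length=5` bound are illustrative, not EuroEval's actual values:

    from pydantic import conlist, create_model

    # Each field maps a label name to a (type, default) tuple; `...` marks it required
    keys_and_their_types = {
        "person": (conlist(str, max_length=5), ...),
        "location": (conlist(str, max_length=5), ...),
    }
    AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)

    print(AnswerFormat(person=["Ada Lovelace"], location=[]).model_dump_json())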
@@ -982,19 +993,19 @@ def load_model_and_tokenizer(

     clear_vllm()

-    executor_backend = "ray" if torch.cuda.device_count() > 1 else "mp"
-
     try:
         model = LLM(
             model=model_id,
             tokenizer=model_id,
-            gpu_memory_utilization=0.
+            gpu_memory_utilization=0.9,
             max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
             download_dir=download_dir,
             trust_remote_code=benchmark_config.trust_remote_code,
             revision=revision,
             seed=4242,
-            distributed_executor_backend=executor_backend,
+            distributed_executor_backend=(
+                "ray" if torch.cuda.device_count() > 1 else "mp"
+            ),
             tensor_parallel_size=torch.cuda.device_count(),
             disable_custom_all_reduce=True,
             quantization=quantization,
@@ -1145,13 +1156,16 @@ def _run_engine_with_fixed_progress_bars(

 def clear_vllm() -> None:
     """Clear the GPU memory used by the vLLM model, enabling re-initialisation."""
-    try:
+    with contextlib.suppress(ValueError):
         destroy_model_parallel()
-    except ValueError:
-        pass
-    clear_memory()
+        destroy_distributed_environment()
     if ray.is_initialized():
         ray.shutdown()
+    with contextlib.suppress(AssertionError):
+        torch.distributed.destroy_process_group()
+    if ray.is_initialized():
+        ray.shutdown()
+    clear_memory()


 def get_end_of_reasoning_token_id(
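The `clear_vllm` change above appears to swap an explicit try/except-pass around `destroy_model_parallel()` for `contextlib.suppress`, which is equivalent but more compact:

    import contextlib

    def flaky() -> None:
        raise ValueError("not initialised")

    # Old style: swallow the error with an empty except block
    try:
        flaky()
    except ValueError:
        pass

    # New style: suppress the same exception type as a context manager
    with contextlib.suppress(ValueError):
        flaky()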
@@ -1175,12 +1189,13 @@ def get_end_of_reasoning_token_id(
     if tokenizer.chat_template is None:
         prompt = "What is your name?"
     else:
-        prompt = tokenizer.apply_chat_template(
+        templated_prompt = tokenizer.apply_chat_template(
             conversation=[dict(role="user", content="What is your name?")],
             add_generation_prompt=True,
             tokenize=False,
         )
-
+        assert isinstance(templated_prompt, str)
+        prompt = templated_prompt

     # Generate a completion and remove the BOS token from it, to not confuse it with the
     # potential reasoning token