EuroEval 15.16.0-py3-none-any.whl → 16.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +8 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +190 -110
- euroeval/benchmark_modules/vllm.py +199 -139
- euroeval/benchmarker.py +49 -22
- euroeval/cli.py +3 -3
- euroeval/constants.py +19 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +73 -23
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +35 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +6 -6
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +90 -20
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +276 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/model_cache.py +13 -1
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +19 -8
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +128 -42
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +59 -73
- euroeval/tasks.py +33 -6
- euroeval/tokenization_utils.py +294 -207
- euroeval/utils.py +150 -35
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
- euroeval-16.0.1.dist-info/RECORD +69 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -470
- euroeval-15.16.0.dist-info/RECORD +0 -63
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py

@@ -5,7 +5,6 @@ import contextlib
 import importlib.util
 import json
 import logging
-import os
 import re
 import typing as t
 from functools import partial
@@ -16,6 +15,7 @@ import torch
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
 from tqdm.auto import tqdm
+from transformers import MistralCommonTokenizer
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 from urllib3.exceptions import RequestError
@@ -24,11 +24,10 @@ from ..constants import (
     CUSTOM_STOP_TOKENS,
     GENERATIVE_PIPELINE_TAGS,
     MAX_CONTEXT_LENGTH,
-
+    MAX_VLLM_LOGPROBS,
     MERGE_TAGS,
     REASONING_MAX_TOKENS,
     REASONING_TOKENS,
-    TASKS_USING_JSON,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
 from ..data_models import GenerativeModelOutput, ModelConfig
@@ -54,17 +53,20 @@ from ..task_group_utils import (
     token_classification,
 )
 from ..tokenization_utils import (
+    apply_chat_template,
     get_bos_token,
     get_end_of_chat_token_ids,
     get_eos_token,
     get_first_label_token_mapping,
     get_pad_token,
+    has_chat_template,
     should_prompts_be_stripped,
 )
 from ..types import ExtractLabelsFunction
 from ..utils import (
     clear_memory,
     create_model_cache_dir,
+    get_hf_token,
     get_min_cuda_compute_capability,
     log_once,
 )
@@ -79,9 +81,6 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
     from vllm.lora.request import LoRARequest
     from vllm.sampling_params import GuidedDecodingParams

-if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
-    import ray
-
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
     from transformers.tokenization_utils import PreTrainedTokenizer
@@ -104,6 +103,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the vLLM model.

@@ -114,27 +114,26 @@ class VLLMModel(HuggingFaceEncoderModel):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log the model and dataset metadata.
         """
-        if (
-            importlib.util.find_spec("vllm") is None
-            or importlib.util.find_spec("ray") is None
-        ):
+        if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")

-        model,
+        model, tokeniser = load_model_and_tokeniser(
             model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: "LLM" = model
-        self.
+        self._tokeniser: "PreTrainedTokenizer" = tokeniser
         self.end_of_reasoning_token = get_end_of_reasoning_token(
-            model=self._model,
+            model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
         )
         self.end_of_chat_token_ids = get_end_of_chat_token_ids(
-
+            tokeniser=self._tokeniser
         )
         self.custom_stop_tokens = get_custom_stop_tokens(
             model=self._model,
-
+            tokeniser=self._tokeniser,
             model_id=model_config.model_id,
             is_reasoning_model=self.end_of_reasoning_token is not None,
         )
@@ -145,15 +144,17 @@ class VLLMModel(HuggingFaceEncoderModel):
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
         )

         self.buffer |= dict(
-            instruction_model=self.
+            instruction_model=has_chat_template(tokeniser=self._tokeniser),
             first_label_token_mapping=get_first_label_token_mapping(
                 dataset_config=self.dataset_config,
                 model_config=self.model_config,
-
+                tokeniser=self._tokeniser,
                 generative_type=self.generative_type,
+                log_metadata=self.log_metadata,
             ),
         )
         if self.model_config.adapter_base_model_id is not None:
@@ -167,13 +168,16 @@ class VLLMModel(HuggingFaceEncoderModel):
             )

     def __del__(self) -> None:
-        """Clean up the model and
-
-
+        """Clean up the model and tokeniser."""
+        try:
+            if importlib.util.find_spec("vllm") is not None:
+                clear_vllm()
+        except ImportError:
+            pass
         if hasattr(self, "_model"):
             del self._model
-        if hasattr(self, "
-            del self.
+        if hasattr(self, "_tokeniser"):
+            del self._tokeniser

     @property
     def generative_type(self) -> GenerativeType | None:
@@ -182,12 +186,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The generative type of the model, or None if it has not been set yet.
         """
-        if not hasattr(self, "
+        if not hasattr(self, "_tokeniser"):
             return None
         elif self.end_of_reasoning_token is not None:
             return GenerativeType.REASONING
         elif (
-            self.
+            has_chat_template(tokeniser=self._tokeniser)
             or "instruct" in self.model_config.model_id.lower()
         ):
             return GenerativeType.INSTRUCTION_TUNED
@@ -267,7 +271,10 @@ class VLLMModel(HuggingFaceEncoderModel):

         if self.benchmark_config.few_shot:
             few_shot_examples = extract_few_shot_examples(
-                dataset=dataset,
+                dataset=dataset,
+                dataset_config=self.dataset_config,
+                benchmark_config=self.benchmark_config,
+                itr_idx=itr_idx,
             )
         else:
             few_shot_examples = list()
@@ -280,7 +287,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 dataset_config=self.dataset_config,
                 instruction_model=self.buffer["instruction_model"],
                 always_populate_text_field=True,
-
+                tokeniser=self._tokeniser,
             ),
             batched=True,
             load_from_cache_file=False,
@@ -298,66 +305,100 @@ class VLLMModel(HuggingFaceEncoderModel):

         Returns:
             The generated model outputs.
+
+        Raises:
+            InvalidBenchmark:
+                If the dataset requires logprobs, but we could not get the first token
+                of each label in the dataset.
         """
         # Get stopping tokens
         stop_tokens: list[str] = self.custom_stop_tokens.copy()
         if self.buffer["instruction_model"] is False:
             stop_tokens.append("\n\n")
-        if self.
-            assert isinstance(self.
+        if self._tokeniser.pad_token_id is not None:
+            assert isinstance(self._tokeniser.pad_token, str), (
                 f"The pad token for the model {self.model_config.model_id!r} "
-                f"is not a string, which is unexpected: {self.
+                f"is not a string, which is unexpected: {self._tokeniser.pad_token!r}."
             )
-            stop_tokens.append(self.
-        if self.
-            assert isinstance(self.
+            stop_tokens.append(self._tokeniser.pad_token)
+        if self._tokeniser.eos_token_id is not None:
+            assert isinstance(self._tokeniser.eos_token, str), (
                 f"The EOS token for the model {self.model_config.model_id!r} "
-                f"is not a string, which is unexpected: {self.
+                f"is not a string, which is unexpected: {self._tokeniser.eos_token!r}."
             )
-            stop_tokens.append(self.
-        if self.
-            self.
-            self.
+            stop_tokens.append(self._tokeniser.eos_token)
+        if self._tokeniser.pad_token_id is None:
+            self._tokeniser.pad_token_id = self._tokeniser.eos_token_id
+            self._tokeniser.pad_token = self._tokeniser.eos_token
         if self.end_of_chat_token_ids is not None:
-            end_of_chat_token = self.
+            end_of_chat_token = self._tokeniser.decode(
                 self.end_of_chat_token_ids
             ).strip()
             if end_of_chat_token:
                 stop_tokens.append(end_of_chat_token)

-        structured_generation_schema = None
-        if self.dataset_config.task in TASKS_USING_JSON:
-            if self.generative_type == GenerativeType.REASONING:
-                log_once(
-                    f"The model {self.model_config.model_id!r} is a reasoning model "
-                    "and thus does not support structured generation, so we do not "
-                    "enable it.",
-                    level=logging.DEBUG,
-                )
-            else:
-                ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-                keys_and_their_types: dict[str, t.Any] = {
-                    tag_name: (conlist(str, max_length=5), ...)
-                    for tag_name in ner_tag_names
-                }
-                answer_format_class = create_model(
-                    "AnswerFormat", **keys_and_their_types
-                )
-                structured_generation_schema = answer_format_class.model_json_schema()
-                log_once(
-                    "Using structured generation with the JSON schema "
-                    f"{structured_generation_schema}",
-                    level=logging.DEBUG,
-                )
-
         # Get the mapping from labels to the first token in the label. We call this each
         # time we generate a new dataset since the dataset config can change
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
             dataset_config=self.dataset_config,
             model_config=self.model_config,
-
+            tokeniser=self._tokeniser,
             generative_type=self.generative_type,
+            log_metadata=self.log_metadata,
         )
+        if (
+            not self.buffer["first_label_token_mapping"]
+            and self.dataset_config.task.requires_logprobs
+        ):
+            raise InvalidBenchmark(
+                "The dataset requires logprobs, but we encountered an error when "
+                "trying to get the first token of each label in the dataset. You can "
+                "try running this benchmark with the --verbose flag to see what the "
+                "error was. Skipping this evaluation."
+            )
+
+        structured_generation_schema = None
+        if (
+            self.dataset_config.task.uses_structured_output
+            or (self.dataset_config.task.uses_logprobs and self.dataset_config.labels)
+        ) and self.generative_type == GenerativeType.REASONING:
+            guided_decoding = None
+            logger.debug(
+                "The dataset uses structured output, but we are not using it as the "
+                "model is a reasoning model."
+            )
+        elif self.dataset_config.task.uses_structured_output:
+            ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+            keys_and_their_types: dict[str, t.Any] = {
+                tag_name: (conlist(str, max_length=5), ...)
+                for tag_name in ner_tag_names
+            }
+            answer_format_class = create_model("AnswerFormat", **keys_and_their_types)
+            structured_generation_schema = answer_format_class.model_json_schema()
+            log_once(
+                "Using structured generation with the JSON schema: "
+                f"{json.dumps(structured_generation_schema)}",
+                level=logging.DEBUG,
+            )
+            guided_decoding = GuidedDecodingParams(json=structured_generation_schema)
+        elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
+            guided_decoding = GuidedDecodingParams(
+                choice=[
+                    self.dataset_config.prompt_label_mapping[label]
+                    for label in self.dataset_config.labels
+                ]
+            )
+            log_once(
+                "Using structured generation with the choices: "
+                f"{guided_decoding.choice!r}.",
+                level=logging.DEBUG,
+            )
+        else:
+            guided_decoding = None
+            log_once(
+                "Not using structured generation as the dataset does not require it.",
+                level=logging.DEBUG,
+            )

         # Define the parameters used for vLLM generation
         max_tokens: int = (
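The NER branch above builds its JSON schema with Pydantic before handing it to guided decoding. A minimal standalone sketch of that pattern, with invented tag names (EuroEval takes them from the dataset's prompt_label_mapping):

import json
import typing as t

from pydantic import conlist, create_model

# Hypothetical NER tag names; in EuroEval these come from the dataset config.
ner_tag_names = ["person", "location", "organization", "miscellaneous"]

# One required field per tag, each a list of strings capped at five entries,
# mirroring the `conlist(str, max_length=5)` fields in the diff above.
keys_and_their_types: dict[str, t.Any] = {
    tag_name: (conlist(str, max_length=5), ...) for tag_name in ner_tag_names
}
AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)

# This JSON schema is what gets passed to vLLM as the guided-decoding target.
print(json.dumps(AnswerFormat.model_json_schema(), indent=2))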
@@ -367,14 +408,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         sampling_params = SamplingParams(
             max_tokens=max_tokens,
-            logprobs=
+            logprobs=MAX_VLLM_LOGPROBS
+            if self.buffer["first_label_token_mapping"]
+            else None,
             temperature=0.0,
             stop=[stop_token for stop_token in stop_tokens if stop_token],
-            guided_decoding=
-                GuidedDecodingParams(json=structured_generation_schema)
-                if structured_generation_schema
-                else None
-            ),
+            guided_decoding=guided_decoding,
         )

         # If any of the prompts are empty then we need to replace them with a BOS token
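Outside of EuroEval, the same guided-decoding setup can be reproduced with vLLM directly. A rough sketch, assuming a recent vLLM release that ships GuidedDecodingParams (the label strings are placeholders):

from vllm.sampling_params import GuidedDecodingParams, SamplingParams

# Restrict generation to a fixed set of label strings, as the classification
# branch above does via the dataset's prompt_label_mapping.
guided_decoding = GuidedDecodingParams(choice=["positive", "neutral", "negative"])

sampling_params = SamplingParams(
    max_tokens=8,
    temperature=0.0,
    logprobs=10,  # only needed when a first-label-token mapping is in play
    stop=["\n\n"],
    guided_decoding=guided_decoding,
)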
@@ -383,7 +422,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if any(len(prompt) == 0 for prompt in prompts):
             logger.debug("Found empty prompts, replacing with BOS token.")
             prompts = [
-                prompt if len(prompt) > 0 else str(self.
+                prompt if len(prompt) > 0 else str(self._tokeniser.bos_token)
                 for prompt in prompts
             ]

@@ -394,7 +433,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if not self.buffer.get(
             "instruction_model", False
         ) and should_prompts_be_stripped(
-            labels_to_be_generated=labels_to_be_generated,
+            labels_to_be_generated=labels_to_be_generated, tokeniser=self._tokeniser
         ):
             log_once(
                 f"Stripping prompts for model {self.model_config.model_id!r}.",
@@ -405,6 +444,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         # Generate sequences using vLLM
         input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
         num_attempts = 3
+        truncation_attempts = 0
         for _ in range(num_attempts):
             try:
                 raw_outputs = self._model.generate(
@@ -432,22 +472,29 @@ class VLLMModel(HuggingFaceEncoderModel):
                         "Prompts are too long, so truncating them and trying again..."
                     )
                     logger.debug(f"The error message was: {str(e)}")
-
+
+                    # If we have already tried truncating the prompts a few times, then
+                    # we truncate a bit more aggressively
+                    extra_truncation = 50 * truncation_attempts
+                    truncation_attempts += 1
+
+                    tokenized_prompts = self._tokeniser(
                         text=prompts,
                         truncation=True,
                         max_length=max(
-                            min(self.
-                            - max_tokens
+                            min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH)
+                            - max_tokens
+                            - extra_truncation,
                             0,
                         ),
                     )
-                    prompts = self.
+                    prompts = self._tokeniser.batch_decode(
                         sequences=tokenized_prompts.input_ids, skip_special_tokens=True
                     )
                 else:
                     raise InvalidBenchmark(
                         f"An error occurred during vLLM generation: {str(e)}"
-                    )
+                    ) from e
         else:
             raise InvalidBenchmark(
                 f"Could not generate sequences after {num_attempts} attempts."
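The retry loop above trims 50 extra tokens per failed attempt so that repeated "prompts too long" errors converge. A small illustrative helper showing the same arithmetic (the function name and the 10,000-token cap are assumptions, not EuroEval's constants):

def truncated_max_length(
    model_max_length: int, max_new_tokens: int, attempt: int, hard_cap: int = 10_000
) -> int:
    # Leave room for the completion, then trim 50 more tokens per retry.
    extra_truncation = 50 * attempt
    return max(min(model_max_length, hard_cap) - max_new_tokens - extra_truncation, 0)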
@@ -477,7 +524,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
-        completions = self.
+        completions = self._tokeniser.batch_decode(
             sequences=[
                 torch.LongTensor(completion_id) for completion_id in completion_ids
             ]
@@ -625,10 +672,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         )


-def
+def load_model_and_tokeniser(
     model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
 ) -> tuple["LLM", "PreTrainedTokenizer"]:
-    """Load the model and
+    """Load the model and tokeniser.

     Args:
         model_config:
@@ -637,7 +684,7 @@ def load_model_and_tokenizer(
             The benchmark configuration.

     Returns:
-        A pair (model,
+        A pair (model, tokeniser), with the loaded model and tokeniser
     """
     # Prefer base model ID if the model is an adapter - the adapter will be added on
     # during inference in this case
@@ -675,7 +722,7 @@ def load_model_and_tokenizer(
     dtype: str | torch.dtype = "auto"

     # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
-    if hf_model_config.
+    if hf_model_config.dtype == torch.float32:
         if torch.cuda.is_bf16_supported():
             logger.info(
                 "You are loading a model with dtype FP32, which we will convert to "
@@ -692,34 +739,32 @@ def load_model_and_tokenizer(
             dtype = torch.float16

     # If the model is a quantized model, we might need to change the dtype
-    if quantization == "mxfp4" and hf_model_config.
+    if quantization == "mxfp4" and hf_model_config.dtype is None:
         dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
         logger.debug(
-            "You are loading a quantized model where `
+            "You are loading a quantized model where `dtype` has not been set. "
             f"Setting dtype to {dtype!r}."
         )
-    elif quantization is not None and hf_model_config.
+    elif quantization is not None and hf_model_config.dtype != torch.float16:
         logger.info(
             "You are loading a quantized model with dtype "
-            f"{hf_model_config.
+            f"{hf_model_config.dtype}, which vLLM does not support. Setting "
             "dtype to float16 instead."
         )
         dtype = torch.float16

     # If the model is a bf16 model, we need to check the CUDA compute capability
-    if hf_model_config.
+    if hf_model_config.dtype == torch.bfloat16:
         min_cuda_compute_capability = get_min_cuda_compute_capability()
         required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY

         if min_cuda_compute_capability is not None:
             if min_cuda_compute_capability < required_capability:
                 logger.info(
-                    "You are loading a model with "
-
-                    "
-                    f"
-                    "You are using one or more devices with "
-                    f"compute capability {min_cuda_compute_capability}. "
+                    f"You are loading a model with dtype {hf_model_config.dtype}, "
+                    "which vLLM only supports for CUDA devices with CUDA compute "
+                    f"capability >={required_capability}. You are using one or more "
+                    f"devices with compute capability {min_cuda_compute_capability}. "
                     "Setting dtype to float16 instead."
                 )
                 dtype = torch.float16
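The bf16 downgrade above hinges on the CUDA compute capability of the attached GPUs. A rough standalone version of that check, assuming the 8.0 (Ampere) threshold behind VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY:

import torch

def pick_dtype(config_dtype: "torch.dtype | None") -> "torch.dtype | str":
    # Illustrative dtype selection, loosely following the logic in the diff above.
    if config_dtype == torch.float32:
        return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    if config_dtype == torch.bfloat16:
        capabilities = [
            torch.cuda.get_device_capability(idx)
            for idx in range(torch.cuda.device_count())
        ]
        # bf16 is assumed to need compute capability 8.0 or newer under vLLM.
        if capabilities and min(capabilities) < (8, 0):
            return torch.float16
    return "auto"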
@@ -747,14 +792,14 @@ def load_model_and_tokenizer(
     else:
         true_max_model_len = MAX_CONTEXT_LENGTH

-
+    tokeniser = load_tokeniser(
         model_id=model_config.model_id,
         revision=model_config.revision,
         adapter_base_model_id=model_config.adapter_base_model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
         model_max_length=true_max_model_len,
         model_cache_dir=model_config.model_cache_dir,
-        token=benchmark_config.api_key
+        token=get_hf_token(api_key=benchmark_config.api_key),
     )

     clear_vllm()
@@ -769,9 +814,7 @@ def load_model_and_tokenizer(
             trust_remote_code=benchmark_config.trust_remote_code,
             revision=revision,
             seed=4242,
-            distributed_executor_backend=
-                "ray" if torch.cuda.device_count() > 1 else "mp"
-            ),
+            distributed_executor_backend="mp",
             tensor_parallel_size=torch.cuda.device_count(),
             disable_custom_all_reduce=True,
             quantization=quantization,
@@ -782,29 +825,39 @@ def load_model_and_tokenizer(
             enable_prefix_caching=False,
             enable_lora=model_config.adapter_base_model_id is not None,
             max_lora_rank=256,
+            # Special arguments in case we are dealing with a Mistral model
+            tokenizer_mode="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
+            config_format="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
+            load_format="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
         )
     except (RuntimeError, ValueError, OSError) as e:
         if "awaiting a review from the repo authors" in str(e):
             raise InvalidModel(
                 f"The model {model_id!r} is awaiting a review from the repository "
                 "authors. Please try again later."
-            )
+            ) from e
         elif "trust_remote_code" in str(e):
             raise InvalidModel(
                 f"Loading the model {model_id!r} needs to trust remote code. "
                 "If you trust the suppliers of this model, then you can enable "
                 "this by setting the `--trust-remote-code` flag."
-            )
+            ) from e
         raise InvalidModel(
             f"The model {model_id!r} could not be loaded. The error was {e!r}."
-        )
+        ) from e

     model.config = hf_model_config

-    return model,
+    return model, tokeniser


-def
+def load_tokeniser(
     model_id: str,
     revision: str,
     adapter_base_model_id: str | None,
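In isolation, the Mistral-specific engine arguments added above amount to switching vLLM's tokeniser, config and weight formats together. A sketch under the assumption that a MistralCommonTokenizer was loaded (the model ID is a placeholder):

from transformers import MistralCommonTokenizer
from vllm import LLM

tokeniser = ...  # whatever load_tokeniser() returned for the model
is_mistral = isinstance(tokeniser, MistralCommonTokenizer)

llm = LLM(
    model="mistralai/Mistral-7B-Instruct-v0.3",  # placeholder model ID
    tokenizer_mode="mistral" if is_mistral else "auto",
    config_format="mistral" if is_mistral else "auto",
    load_format="mistral" if is_mistral else "auto",
)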
@@ -813,7 +866,7 @@ def load_tokenizer(
     model_cache_dir: str,
     token: str | bool,
 ) -> "PreTrainedTokenizer":
-    """Load the
+    """Load the tokeniser.

     Args:
         model_id:
@@ -833,7 +886,7 @@ def load_tokenizer(
             The Hugging Face API token.

     Returns:
-        The loaded
+        The loaded tokeniser.
     """
     revision = revision if adapter_base_model_id is None else "main"
     config = AutoConfig.from_pretrained(
@@ -846,7 +899,7 @@ def load_tokenizer(
     num_retries = 5
     for _ in range(num_retries):
         try:
-
+            tokeniser = AutoTokenizer.from_pretrained(
                 model_id,
                 use_fast=True,
                 verbose=False,
@@ -861,30 +914,45 @@ def load_tokenizer(
         except (json.JSONDecodeError, OSError, TypeError) as e:
             if adapter_base_model_id is None or model_id == adapter_base_model_id:
                 raise InvalidModel(
-                    f"Could not load
+                    f"Could not load tokeniser for model {model_id!r}. The error was "
                     f"{str(e)}."
-                )
+                ) from e
             logger.debug(
-                f"Could not load
+                f"Could not load tokeniser for {model_id!r}. Falling back to "
                 f"{adapter_base_model_id!r}."
             )
             model_id = adapter_base_model_id
         except (TimeoutError, RequestError):
-            logger.info(f"Couldn't load
+            logger.info(f"Couldn't load tokeniser for {model_id!r}. Retrying.")
             sleep(5)
             continue
+        except (KeyError, ValueError) as e:
+            if "mistral" in str(e).lower():
+                tokeniser = MistralCommonTokenizer.from_pretrained(
+                    model_id,
+                    padding_side="left",
+                    truncation_side="left",
+                    model_max_length=model_max_length,
+                    token=token,
+                )
+                break
+            raise InvalidModel(
+                f"Could not load tokeniser for model {model_id!r}. The error was "
+                f"{str(e)}."
+            ) from e
     else:
         raise InvalidModel(
-            f"Could not load
+            f"Could not load tokeniser for model {model_id!r} after {num_retries} "
             "attempts."
         )

     # Ensure that BOS, EOS and PAD tokens are set
-
-
-
+    if not isinstance(tokeniser, MistralCommonTokenizer):
+        tokeniser.bos_token, tokeniser.bos_token_id = get_bos_token(tokeniser=tokeniser)
+        tokeniser.eos_token, tokeniser.eos_token_id = get_eos_token(tokeniser=tokeniser)
+        tokeniser.pad_token, tokeniser.pad_token_id = get_pad_token(tokeniser=tokeniser)

-    return
+    return tokeniser


 def clear_vllm() -> None:
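Stripped of EuroEval's retry bookkeeping, the new KeyError/ValueError branch boils down to trying the standard tokeniser first and only falling back to Mistral's own format when that fails. Roughly (the helper name is illustrative):

from transformers import AutoTokenizer, MistralCommonTokenizer

def load_any_tokeniser(model_id: str, model_max_length: int):
    # Illustrative fallback: standard tokeniser first, Mistral format second.
    try:
        return AutoTokenizer.from_pretrained(
            model_id, use_fast=True, model_max_length=model_max_length
        )
    except (KeyError, ValueError) as e:
        if "mistral" not in str(e).lower():
            raise
        return MistralCommonTokenizer.from_pretrained(
            model_id,
            padding_side="left",
            truncation_side="left",
            model_max_length=model_max_length,
        )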
@@ -892,25 +960,21 @@ def clear_vllm() -> None:
     with contextlib.suppress(ValueError):
         destroy_model_parallel()
         destroy_distributed_environment()
-        if ray.is_initialized():
-            ray.shutdown()
     with contextlib.suppress(AssertionError):
         torch.distributed.destroy_process_group()
-        if ray.is_initialized():
-            ray.shutdown()
     clear_memory()


 def get_end_of_reasoning_token(
-    model: "LLM",
+    model: "LLM", tokeniser: "PreTrainedTokenizer", model_id: str
 ) -> str | None:
     """Get the end-of-reasoning token for a generative model.

     Args:
         model:
             The vLLM model.
-
-            The
+        tokeniser:
+            The tokeniser.
         model_id:
             The model ID.

@@ -919,11 +983,9 @@ def get_end_of_reasoning_token(
     """
     # Create a prompt to check if the model uses the reasoning tokens
     prompt = "What is your name?"
-    if
-        templated_prompt =
-            conversation=[dict(role="user", content=prompt)],
-            add_generation_prompt=True,
-            tokenize=False,
+    if has_chat_template(tokeniser=tokeniser):
+        templated_prompt = apply_chat_template(
+            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
@@ -948,7 +1010,7 @@ def get_end_of_reasoning_token(
             f"The model {model_id!r} did not generate any beginning-of-reasoning "
             "tokens in the prompt or the completion. Assuming the model is not "
             "a reasoning model.",
-            level=logging.
+            level=logging.DEBUG,
         )
         return None

@@ -974,7 +1036,7 @@ def get_end_of_reasoning_token(
             "the beginning-of-reasoning tokens "
             f"{[bor_token for bor_token, _ in bor_reasoning_matches]!r}. "
             "This is probably not correct, so please report this issue.",
-            level=logging.
+            level=logging.WARNING,
         )
         return None

@@ -984,14 +1046,14 @@ def get_end_of_reasoning_token(
         f"model {model_id!r}. Using {eor_reasoning_matches[0]!r} as "
         "the reasoning token. If this is not the correct reasoning token, "
         "please report this issue.",
-        level=logging.
+        level=logging.WARNING,
     )

     bor_token, eor_token = eor_reasoning_matches[0]
     log_once(
         f"Detected beginning-of-reasoning token {bor_token!r} and end-of-reasoning "
         f"token {eor_token!r} for model {model_id!r}.",
-        level=logging.
+        level=logging.DEBUG,
     )

     return eor_token
@@ -999,7 +1061,7 @@ def get_end_of_reasoning_token(

 def get_custom_stop_tokens(
     model: "LLM",
-
+    tokeniser: "PreTrainedTokenizer",
     model_id: str,
     is_reasoning_model: bool,
 ) -> list[str]:
@@ -1008,8 +1070,8 @@ def get_custom_stop_tokens(
     Args:
         model:
             The vLLM model.
-
-            The
+        tokeniser:
+            The tokeniser.
         model_id:
             The model ID.
         is_reasoning_model:
@@ -1022,11 +1084,9 @@
     candidate_stop_tokens = CUSTOM_STOP_TOKENS

     prompt = "Hello"
-    if
-        templated_prompt =
-            conversation=[dict(role="user", content=prompt)],
-            add_generation_prompt=True,
-            tokenize=False,
+    if has_chat_template(tokeniser=tokeniser):
+        templated_prompt = apply_chat_template(
+            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt