PyPI - EuroEval - Versions diffs - 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl - Mend

EuroEval 15.12.0 (py3-none-any.whl) → 16.7.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87)

euroeval/__init__.py +32 -14
euroeval/benchmark_config_factory.py +92 -180
euroeval/benchmark_modules/base.py +49 -39
euroeval/benchmark_modules/fresh.py +35 -21
euroeval/benchmark_modules/hf.py +280 -244
euroeval/benchmark_modules/litellm.py +752 -312
euroeval/benchmark_modules/vllm.py +570 -268
euroeval/benchmarker.py +651 -528
euroeval/caching_utils.py +79 -0
euroeval/callbacks.py +5 -7
euroeval/cli.py +49 -38
euroeval/constants.py +44 -25
euroeval/data_loading.py +111 -55
euroeval/data_models.py +490 -323
euroeval/dataset_configs/__init__.py +26 -4
euroeval/dataset_configs/bosnian.py +39 -0
euroeval/dataset_configs/bulgarian.py +56 -0
euroeval/dataset_configs/croatian.py +56 -0
euroeval/dataset_configs/czech.py +75 -0
euroeval/dataset_configs/danish.py +78 -50
euroeval/dataset_configs/dutch.py +74 -44
euroeval/dataset_configs/english.py +71 -36
euroeval/dataset_configs/estonian.py +111 -0
euroeval/dataset_configs/faroese.py +25 -18
euroeval/dataset_configs/finnish.py +63 -26
euroeval/dataset_configs/french.py +65 -32
euroeval/dataset_configs/german.py +77 -36
euroeval/dataset_configs/greek.py +64 -0
euroeval/dataset_configs/icelandic.py +68 -57
euroeval/dataset_configs/italian.py +68 -36
euroeval/dataset_configs/latvian.py +87 -0
euroeval/dataset_configs/lithuanian.py +64 -0
euroeval/dataset_configs/norwegian.py +98 -72
euroeval/dataset_configs/polish.py +96 -0
euroeval/dataset_configs/portuguese.py +63 -40
euroeval/dataset_configs/serbian.py +64 -0
euroeval/dataset_configs/slovak.py +55 -0
euroeval/dataset_configs/slovene.py +56 -0
euroeval/dataset_configs/spanish.py +68 -34
euroeval/dataset_configs/swedish.py +82 -41
euroeval/dataset_configs/ukrainian.py +64 -0
euroeval/enums.py +12 -6
euroeval/exceptions.py +21 -1
euroeval/finetuning.py +34 -26
euroeval/generation.py +76 -41
euroeval/generation_utils.py +169 -34
euroeval/languages.py +1020 -188
euroeval/logging_utils.py +268 -0
euroeval/metrics/__init__.py +6 -0
euroeval/metrics/base.py +85 -0
euroeval/metrics/huggingface.py +216 -0
euroeval/metrics/llm_as_a_judge.py +260 -0
euroeval/metrics/pipeline.py +289 -0
euroeval/metrics/speed.py +48 -0
euroeval/model_cache.py +40 -21
euroeval/model_config.py +4 -5
euroeval/model_loading.py +3 -0
euroeval/prompt_templates/__init__.py +2 -0
euroeval/prompt_templates/classification.py +206 -0
euroeval/prompt_templates/linguistic_acceptability.py +157 -22
euroeval/prompt_templates/multiple_choice.py +159 -17
euroeval/prompt_templates/named_entity_recognition.py +318 -21
euroeval/prompt_templates/reading_comprehension.py +207 -16
euroeval/prompt_templates/sentiment_classification.py +205 -22
euroeval/prompt_templates/summarization.py +122 -22
euroeval/prompt_templates/token_classification.py +279 -0
euroeval/scores.py +20 -9
euroeval/speed_benchmark.py +11 -12
euroeval/task_group_utils/multiple_choice_classification.py +21 -12
euroeval/task_group_utils/question_answering.py +101 -73
euroeval/task_group_utils/sequence_classification.py +144 -61
euroeval/task_group_utils/text_to_text.py +33 -12
euroeval/task_group_utils/token_classification.py +86 -89
euroeval/tasks.py +75 -16
euroeval/tokenisation_utils.py +603 -0
euroeval/types.py +17 -11
euroeval/utils.py +332 -137
euroeval-16.7.1.dist-info/METADATA +623 -0
euroeval-16.7.1.dist-info/RECORD +84 -0
{euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
euroeval/human_evaluation.py +0 -737
euroeval/metrics.py +0 -452
euroeval/tokenization_utils.py +0 -498
euroeval-15.12.0.dist-info/METADATA +0 -285
euroeval-15.12.0.dist-info/RECORD +0 -63
{euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
{euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0

euroeval/benchmark_modules/vllm.py CHANGED Viewed

@@ -5,8 +5,8 @@ import contextlib
 import importlib.util
 import json
 import logging
-import os
 import re
+import shutil
 import typing as t
 from functools import partial
 from pathlib import Path
@@ -15,23 +15,22 @@ from time import sleep
 import torch
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
-from tqdm.auto import tqdm
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers.tokenization_mistral_common import MistralCommonTokenizer
 from urllib3.exceptions import RequestError
 from ..constants import (
     CUSTOM_STOP_TOKENS,
     GENERATIVE_PIPELINE_TAGS,
     MAX_CONTEXT_LENGTH,
-    MAX_LOGPROBS,
+    MAX_VLLM_LOGPROBS,
     MERGE_TAGS,
     REASONING_MAX_TOKENS,
     REASONING_TOKENS,
-    TASKS_USING_JSON,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
-from ..data_models import GenerativeModelOutput, ModelConfig
+from ..data_models import GenerativeModelOutput, HashableDict, ModelConfig
 from ..enums import (
     BatchingPreference,
     GenerativeType,
@@ -44,29 +43,41 @@ from ..exceptions import (
     InvalidModel,
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
+    NeedsSystemDependency,
+)
+from ..generation_utils import (
+    apply_prompt,
+    extract_few_shot_examples,
+    raise_if_wrong_params,
 )
-from ..generation_utils import apply_prompt, extract_few_shot_examples
 from ..languages import get_all_languages
+from ..logging_utils import get_pbar, log, log_once, no_terminal_output
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,
     token_classification,
 )
-from ..tokenization_utils import (
+from ..tokenisation_utils import (
+    apply_chat_template,
     get_bos_token,
     get_end_of_chat_token_ids,
     get_eos_token,
     get_first_label_token_mapping,
     get_pad_token,
+    has_chat_template,
     should_prompts_be_stripped,
 )
 from ..types import ExtractLabelsFunction
 from ..utils import (
     clear_memory,
     create_model_cache_dir,
+    flash_attention_backend,
+    get_hf_token,
     get_min_cuda_compute_capability,
-    log_once,
+    internet_connection_available,
+    resolve_model_path,
+    split_model_id,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
@@ -77,13 +88,7 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
         destroy_model_parallel,
     )
     from vllm.lora.request import LoRARequest
-if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
-    from outlines.models.vllm import adapt_tokenizer
-    from outlines.processors.structured import JSONLogitsProcessor
-if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
-    import ray
+    from vllm.sampling_params import StructuredOutputsParams
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -92,7 +97,10 @@ if t.TYPE_CHECKING:
     from ..data_models import BenchmarkConfig, DatasetConfig, Task
-logger = logging.getLogger("euroeval")
+MODELS_REQUIRING_FLASH_ATTENTION: list[re.Pattern] = [
+    re.compile(r".*gpt-oss.*", flags=re.IGNORECASE)
+]
 class VLLMModel(HuggingFaceEncoderModel):
@@ -101,12 +109,17 @@ class VLLMModel(HuggingFaceEncoderModel):
     fresh_model = False
     batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = True
+    allowed_params = {
+        re.compile(r".*"): ["thinking", "no-thinking", "slow-tokenizer"],
+        re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): ["low", "medium", "high"],
+    }
     def __init__(
         self,
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the vLLM model.
@@ -117,30 +130,40 @@ class VLLMModel(HuggingFaceEncoderModel):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log the model and dataset metadata.
         """
-        if (
-            importlib.util.find_spec("vllm") is None
-            or importlib.util.find_spec("ray") is None
-        ):
+        if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")
-        model, tokenizer = load_model_and_tokenizer(
-            model_config=model_config, benchmark_config=benchmark_config
+        if shutil.which("nvcc") is None:
+            raise NeedsSystemDependency(
+                dependency="nvcc",
+                instructions=(
+                    "Please install the CUDA Toolkit from "
+                    "https://developer.nvidia.com/cuda-downloads or ensure that NVCC "
+                    "is available in your PATH."
+                ),
+            )
+        raise_if_wrong_params(
+            model_config=model_config, allowed_params=self.allowed_params
         )
+        with (
+            no_terminal_output(disable=benchmark_config.verbose),
+            flash_attention_backend(
+                disabled=all(
+                    not re.search(pattern=pattern, string=model_config.model_id)
+                    for pattern in MODELS_REQUIRING_FLASH_ATTENTION
+                )
+            ),
+        ):
+            model, tokeniser = load_model_and_tokeniser(
+                model_config=model_config, benchmark_config=benchmark_config
+            )
         self._model: "LLM" = model
-        self._tokenizer: "PreTrainedTokenizer" = tokenizer
-        self.end_of_reasoning_token = get_end_of_reasoning_token(
-            model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
-        )
-        self.end_of_chat_token_ids = get_end_of_chat_token_ids(
-            tokenizer=self._tokenizer
-        )
-        self.custom_stop_tokens = get_custom_stop_tokens(
-            model=self._model,
-            tokenizer=self._tokenizer,
-            model_id=model_config.model_id,
-            is_reasoning_model=self.end_of_reasoning_token is not None,
-        )
+        self._tokeniser: "PreTrainedTokenizer" = tokeniser
         # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
         # to call the `__init__` method of the `BenchmarkModule` class.
@@ -148,16 +171,30 @@ class VLLMModel(HuggingFaceEncoderModel):
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
+        )
+        self.end_of_reasoning_token = get_end_of_reasoning_token(
+            model=self._model, tokeniser=self._tokeniser, model_config=model_config
+        )
+        self.end_of_chat_token_ids = get_end_of_chat_token_ids(
+            tokeniser=self._tokeniser, generative_type=self.generative_type
+        )
+        self.custom_stop_tokens = get_custom_stop_tokens(
+            model=self._model,
+            tokeniser=self._tokeniser,
+            model_id=model_config.model_id,
+            generative_type=self.generative_type,
         )
         self.buffer |= dict(
-            instruction_model=self._tokenizer.chat_template is not None,
             first_label_token_mapping=get_first_label_token_mapping(
                 dataset_config=self.dataset_config,
                 model_config=self.model_config,
-                tokenizer=self._tokenizer,
+                tokeniser=self._tokeniser,
                 generative_type=self.generative_type,
-            ),
+                log_metadata=self.log_metadata,
+            )
         )
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
@@ -170,12 +207,16 @@ class VLLMModel(HuggingFaceEncoderModel):
             )
     def __del__(self) -> None:
-        """Clean up the model and tokenizer."""
-        clear_vllm()
+        """Clean up the model and tokeniser."""
+        try:
+            if importlib.util.find_spec("vllm") is not None:
+                clear_vllm()
+        except ImportError:
+            pass
         if hasattr(self, "_model"):
             del self._model
-        if hasattr(self, "_tokenizer"):
-            del self._tokenizer
+        if hasattr(self, "_tokeniser"):
+            del self._tokeniser
     @property
     def generative_type(self) -> GenerativeType | None:
@@ -184,17 +225,37 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The generative type of the model, or None if it has not been set yet.
         """
-        if not hasattr(self, "_tokenizer"):
+        if not hasattr(self, "_tokeniser"):
+            log_once(
+                "The generative type of the model has not been set yet as the "
+                "tokeniser has not been loaded.",
+                level=logging.DEBUG,
+            )
             return None
-        elif self.end_of_reasoning_token is not None:
-            return GenerativeType.REASONING
+        elif self.benchmark_config.generative_type is not None:
+            type_ = self.benchmark_config.generative_type
+        elif self.model_config.param in {"thinking"}:
+            type_ = GenerativeType.REASONING
+        elif self.model_config.param in {"no-thinking"}:
+            type_ = GenerativeType.INSTRUCTION_TUNED
+        elif (
+            hasattr(self, "end_of_reasoning_token")
+            and self.end_of_reasoning_token is not None
+        ):
+            type_ = GenerativeType.REASONING
         elif (
-            self._tokenizer.chat_template is not None
+            has_chat_template(tokeniser=self._tokeniser)
             or "instruct" in self.model_config.model_id.lower()
         ):
-            return GenerativeType.INSTRUCTION_TUNED
+            type_ = GenerativeType.INSTRUCTION_TUNED
         else:
-            return GenerativeType.BASE
+            type_ = GenerativeType.BASE
+        log_once(
+            f"Detected generative type {type_.name!r} for model "
+            f"{self.model_config.model_id!r}",
+            level=logging.DEBUG,
+        )
+        return type_
     @property
     def extract_labels_from_generation(self) -> ExtractLabelsFunction:
@@ -211,6 +272,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    model_config=self.model_config,
                     first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
@@ -269,7 +331,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         if self.benchmark_config.few_shot:
             few_shot_examples = extract_few_shot_examples(
-                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
+                dataset=dataset,
+                dataset_config=self.dataset_config,
+                benchmark_config=self.benchmark_config,
+                itr_idx=itr_idx,
             )
         else:
             few_shot_examples = list()
@@ -280,9 +345,9 @@ class VLLMModel(HuggingFaceEncoderModel):
                 few_shot_examples=few_shot_examples,
                 model_config=self.model_config,
                 dataset_config=self.dataset_config,
-                instruction_model=self.buffer["instruction_model"],
+                generative_type=self.generative_type,
                 always_populate_text_field=True,
-                tokenizer=self._tokenizer,
+                tokeniser=self._tokeniser,
             ),
             batched=True,
             load_from_cache_file=False,
@@ -300,68 +365,111 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The generated model outputs.
+        Raises:
+            InvalidBenchmark:
+                If the dataset requires logprobs, but we could not get the first token
+                of each label in the dataset.
         """
         # Get stopping tokens
         stop_tokens: list[str] = self.custom_stop_tokens.copy()
-        if self.buffer["instruction_model"] is False:
+        if self.generative_type == GenerativeType.BASE:
             stop_tokens.append("\n\n")
-        if self._tokenizer.pad_token_id is not None:
-            assert isinstance(self._tokenizer.pad_token, str), (
+        if self._tokeniser.pad_token_id is not None:
+            assert isinstance(self._tokeniser.pad_token, str), (
                 f"The pad token for the model {self.model_config.model_id!r} "
-                f"is not a string, which is unexpected: {self._tokenizer.pad_token!r}."
+                f"is not a string, which is unexpected: {self._tokeniser.pad_token!r}."
             )
-            stop_tokens.append(self._tokenizer.pad_token)
-        if self._tokenizer.eos_token_id is not None:
-            assert isinstance(self._tokenizer.eos_token, str), (
+            stop_tokens.append(self._tokeniser.pad_token)
+        if self._tokeniser.eos_token_id is not None:
+            assert isinstance(self._tokeniser.eos_token, str), (
                 f"The EOS token for the model {self.model_config.model_id!r} "
-                f"is not a string, which is unexpected: {self._tokenizer.eos_token!r}."
+                f"is not a string, which is unexpected: {self._tokeniser.eos_token!r}."
             )
-            stop_tokens.append(self._tokenizer.eos_token)
-            if self._tokenizer.pad_token_id is None:
-                self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
-                self._tokenizer.pad_token = self._tokenizer.eos_token
+            stop_tokens.append(self._tokeniser.eos_token)
+            if self._tokeniser.pad_token_id is None:
+                self._tokeniser.pad_token_id = self._tokeniser.eos_token_id
+                self._tokeniser.pad_token = self._tokeniser.eos_token
         if self.end_of_chat_token_ids is not None:
-            end_of_chat_token = self._tokenizer.decode(
+            end_of_chat_token = self._tokeniser.decode(
                 self.end_of_chat_token_ids
             ).strip()
             if end_of_chat_token:
                 stop_tokens.append(end_of_chat_token)
-        logits_processor = None
-        if self.dataset_config.task in TASKS_USING_JSON:
-            if self.generative_type == GenerativeType.REASONING:
-                log_once(
-                    f"The model {self.model_config.model_id!r} is a reasoning model "
-                    "and thus does not support structured generation, so we do not "
-                    "enable it.",
-                    level=logging.DEBUG,
-                )
-            else:
-                ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-                keys_and_their_types: dict[str, t.Any] = {
-                    tag_name: (conlist(str, max_length=5), ...)
-                    for tag_name in ner_tag_names
-                }
-                pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
-                logits_processor = JSONLogitsProcessor(
-                    schema=pydantic_class,
-                    tokenizer=adapt_tokenizer(tokenizer=self._tokenizer),  # type: ignore
-                    whitespace_pattern=r" ?",
-                )
-                log_once(
-                    "Using structured generation with the JSON schema "
-                    f"{pydantic_class.model_json_schema()}",
-                    level=logging.DEBUG,
-                )
         # Get the mapping from labels to the first token in the label. We call this each
         # time we generate a new dataset since the dataset config can change
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
             dataset_config=self.dataset_config,
             model_config=self.model_config,
-            tokenizer=self._tokenizer,
+            tokeniser=self._tokeniser,
             generative_type=self.generative_type,
+            log_metadata=self.log_metadata,
         )
+        if (
+            not self.buffer["first_label_token_mapping"]
+            and self.dataset_config.task.requires_logprobs
+        ):
+            raise InvalidBenchmark(
+                "The dataset requires logprobs, but we encountered an error when "
+                "trying to get the first token of each label in the dataset. You can "
+                "try running this benchmark with the --verbose flag to see what the "
+                "error was. Skipping this evaluation."
+            )
+        structured_generation_schema = None
+        if (
+            self.dataset_config.task.uses_structured_output
+            or (self.dataset_config.task.uses_logprobs and self.dataset_config.labels)
+        ) and self.generative_type == GenerativeType.REASONING:
+            structured_outputs = None
+            log_once(
+                "The dataset uses structured output, but we are not using it as the "
+                f"model {self.model_config.model_id!r} is a reasoning model.",
+                level=logging.DEBUG,
+            )
+        elif self.dataset_config.task.uses_structured_output:
+            ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+            keys_and_their_types: dict[str, t.Any] = {
+                tag_name: (conlist(str, max_length=5), ...)
+                for tag_name in ner_tag_names
+            }
+            answer_format_class = create_model("AnswerFormat", **keys_and_their_types)
+            structured_generation_schema = answer_format_class.model_json_schema()
+            log_once(
+                "Using structured generation with the JSON schema: "
+                f"{json.dumps(structured_generation_schema)}",
+                level=logging.DEBUG,
+            )
+            structured_outputs = StructuredOutputsParams(
+                json=structured_generation_schema
+            )
+        elif (
+            self.dataset_config.task.uses_logprobs
+            and self.dataset_config.labels
+            and self.buffer.get("first_label_token_mapping", False)
+        ):
+            choice_labels = [
+                self.dataset_config.prompt_label_mapping[label]
+                for label in self.dataset_config.labels
+            ]
+            if isinstance(self.buffer["first_label_token_mapping"], dict):
+                choice_labels = [
+                    self.buffer["first_label_token_mapping"][label]
+                    for label in choice_labels
+                ]
+            structured_outputs = StructuredOutputsParams(choice=choice_labels)
+            log_once(
+                "Using structured generation with the choices: "
+                f"{structured_outputs.choice!r}.",
+                level=logging.DEBUG,
+            )
+        else:
+            structured_outputs = None
+            log_once(
+                "Not using structured generation as the dataset does not require it.",
+                level=logging.DEBUG,
+            )
         # Define the parameters used for vLLM generation
         max_tokens: int = (
@@ -371,19 +479,21 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         sampling_params = SamplingParams(
             max_tokens=max_tokens,
-            logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
+            logprobs=MAX_VLLM_LOGPROBS
+            if self.buffer["first_label_token_mapping"]
+            else None,
             temperature=0.0,
             stop=[stop_token for stop_token in stop_tokens if stop_token],
-            logits_processors=[logits_processor] if logits_processor else None,
+            structured_outputs=structured_outputs,
         )
         # If any of the prompts are empty then we need to replace them with a BOS token
         # so that the vLLM model can generate from them
-        prompts: list[str] = inputs["text"]
+        prompts: c.Sequence[str] = inputs["text"]
         if any(len(prompt) == 0 for prompt in prompts):
-            logger.debug("Found empty prompts, replacing with BOS token.")
+            log("Found empty prompts, replacing with BOS token.", level=logging.DEBUG)
             prompts = [
-                prompt if len(prompt) > 0 else str(self._tokenizer.bos_token)
+                prompt if len(prompt) > 0 else str(self._tokeniser.bos_token)
                 for prompt in prompts
             ]
@@ -391,10 +501,8 @@ class VLLMModel(HuggingFaceEncoderModel):
         labels_to_be_generated = list(self.dataset_config.prompt_label_mapping.values())
         if len(labels_to_be_generated) == 0:
             labels_to_be_generated = ["negative", "positive"]
-        if not self.buffer.get(
-            "instruction_model", False
-        ) and should_prompts_be_stripped(
-            labels_to_be_generated=labels_to_be_generated, tokenizer=self._tokenizer
+        if self.generative_type == GenerativeType.BASE and should_prompts_be_stripped(
+            labels_to_be_generated=labels_to_be_generated, tokeniser=self._tokeniser
         ):
             log_once(
                 f"Stripping prompts for model {self.model_config.model_id!r}.",
@@ -402,21 +510,35 @@ class VLLMModel(HuggingFaceEncoderModel):
             )
             prompts = [prompt.strip() for prompt in prompts]
+        # Truncate the prompts if needed, but only if it's not a reasoning model
+        if self.generative_type != GenerativeType.REASONING:
+            max_tokens_per_prompt = (
+                min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH) - max_tokens
+            )
+            tokenized_prompts = self._tokeniser(
+                text=list(prompts), truncation=True, max_length=max_tokens_per_prompt
+            )
+            prompts = self._tokeniser.batch_decode(
+                sequences=tokenized_prompts.input_ids, skip_special_tokens=True
+            )
         # Generate sequences using vLLM
         input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
         num_attempts = 3
+        truncation_attempts = 1
         for _ in range(num_attempts):
             try:
                 raw_outputs = self._model.generate(
                     prompts=prompts,
                     sampling_params=sampling_params,
-                    use_tqdm=False if input_is_a_test else get_pbar_without_leave,
+                    use_tqdm=False if input_is_a_test else get_pbar,
                     lora_request=self.buffer.get("lora_request"),
                 )
                 break
             except TypeError as e:
-                logger.debug(
-                    f"Encountered error during vLLM generation: {str(e)}. Retrying..."
+                log(
+                    f"Encountered error during vLLM generation: {str(e)}. Retrying...",
+                    level=logging.DEBUG,
                 )
                 sleep(1)
             except ValueError as e:
@@ -428,26 +550,34 @@ class VLLMModel(HuggingFaceEncoderModel):
                     re.search(pattern, str(e), flags=re.IGNORECASE) is not None
                     for pattern in truncate_error_messages
                 ):
-                    logger.info(
-                        "Prompts are too long, so truncating them and trying again..."
+                    log(
+                        "Prompts are too long, so truncating them and trying again...",
+                        level=logging.WARNING,
                     )
-                    logger.debug(f"The error message was: {str(e)}")
-                    tokenized_prompts = self._tokenizer(
+                    log(f"The error message was: {str(e)}", level=logging.DEBUG)
+                    # If we have already tried truncating the prompts a few times, then
+                    # we truncate a bit more aggressively
+                    extra_truncation = 50 * truncation_attempts
+                    truncation_attempts += 1
+                    tokenized_prompts = self._tokeniser(
                         text=prompts,
                         truncation=True,
                         max_length=max(
-                            min(self._tokenizer.model_max_length, MAX_CONTEXT_LENGTH)
-                            - max_tokens,
+                            min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH)
+                            - max_tokens
+                            - extra_truncation,
                             0,
                         ),
                     )
-                    prompts = self._tokenizer.batch_decode(
+                    prompts = self._tokeniser.batch_decode(
                         sequences=tokenized_prompts.input_ids, skip_special_tokens=True
                     )
                 else:
                     raise InvalidBenchmark(
                         f"An error occurred during vLLM generation: {str(e)}"
-                    )
+                    ) from e
         else:
             raise InvalidBenchmark(
                 f"Could not generate sequences after {num_attempts} attempts."
@@ -467,34 +597,73 @@ class VLLMModel(HuggingFaceEncoderModel):
                     f"{num_extra_outputs!r} extra outputs."
                 )
             else:
-                logger.debug(
+                log(
                     f"Filtered out {num_extra_outputs:,} extra outputs from the model, "
                     "which occured as we interupted the generation when we truncated "
-                    "the prompts."
+                    "the prompts.",
+                    level=logging.DEBUG,
                 )
-        # Parse the raw model outputs
-        completion_ids: list[list[int]] = [
-            output.outputs[0].token_ids for output in raw_outputs
+        # Parse the raw model outputs. We keep the special tokens for now, as we need
+        # them to potentially remove reasoning content and stop tokens
+        completion_ids: c.Sequence[c.Sequence[int]] = [
+            list(output.outputs[0].token_ids) for output in raw_outputs
         ]
-        completions = self._tokenizer.batch_decode(
+        completions = self._tokeniser.batch_decode(
             sequences=[
                 torch.LongTensor(completion_id) for completion_id in completion_ids
-            ]
+            ],
+            skip_special_tokens=False,
         )
-        if self.end_of_reasoning_token is not None:
-            completions = [
-                completion.split(self.end_of_reasoning_token)[-1]
-                for completion in completions
-            ]
+        if (
+            self.end_of_reasoning_token is not None
+            and self.generative_type == GenerativeType.REASONING
+        ):
+            num_samples_without_eor_token = 0
+            for idx in range(len(completions)):
+                if (
+                    isinstance(self.end_of_reasoning_token, str)
+                    and self.end_of_reasoning_token in completions[idx]
+                ):
+                    completions[idx] = completions[idx].split(
+                        self.end_of_reasoning_token
+                    )[-1]
+                elif isinstance(
+                    self.end_of_reasoning_token, re.Pattern
+                ) and self.end_of_reasoning_token.search(completions[idx]):
+                    completions[idx] = self.end_of_reasoning_token.split(
+                        completions[idx]
+                    )[-1]
+                else:
+                    num_samples_without_eor_token += 1
+                    completions[idx] = ""
+            if num_samples_without_eor_token > 0:
+                log_once(
+                    f"The model {self.model_config.model_id!r} is a reasoning "
+                    "model, but the generated output did not contain the end of "
+                    f"reasoning token ({self.end_of_reasoning_token!r}) in "
+                    f"{num_samples_without_eor_token:,}/{len(completions):,} of "
+                    "the samples. Using an empty string for all these samples "
+                    "instead.",
+                    level=(
+                        logging.WARNING
+                        if num_samples_without_eor_token / len(completions) > 0.5
+                        else logging.DEBUG
+                    ),
+                )
         stop_token_pattern = re.compile(
             "|".join(re.escape(stop_token) for stop_token in stop_tokens)
         )
         completions = [
-            re.split(pattern=stop_token_pattern, string=completion)[0]
+            re.split(pattern=stop_token_pattern, string=completion)[0].strip()
             for completion in completions
         ]
-        completions = [completion.strip() for completion in completions]
+        # Remove all the special tokens from the completions, if any are present
+        completion_ids = self._tokeniser(text=completions).input_ids
+        completions = self._tokeniser.batch_decode(
+            sequences=completion_ids, skip_special_tokens=True
+        )
         # Sanity check
         if len(completions) != len(prompts):
@@ -504,13 +673,13 @@ class VLLMModel(HuggingFaceEncoderModel):
         # Add logprobs scores to the output
         if self.buffer["first_label_token_mapping"]:
-            scores: list[list[list[tuple[str, float]]]] = [
+            scores: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]] = [
                 [
                     [
-                        (obj.decoded_token, obj.logprob)
+                        (obj.decoded_token or "", obj.logprob)
                         for obj in token_logprobs_dict.values()
                     ]
-                    for token_logprobs_dict in raw_output.outputs[0].logprobs
+                    for token_logprobs_dict in raw_output.outputs[0].logprobs or list()
                 ]
                 for raw_output in raw_outputs
             ]
@@ -543,11 +712,18 @@ class VLLMModel(HuggingFaceEncoderModel):
         if using_api:
             return False
-        model_id, revision = (
-            model_id.split("@") if "@" in model_id else (model_id, "main")
-        )
+        model_id_components = split_model_id(model_id=model_id)
+        model_id = model_id_components.model_id
+        revision = model_id_components.revision
         model_info = get_model_repo_info(
-            model_id=model_id, revision=revision, benchmark_config=benchmark_config
+            model_id=model_id,
+            revision=revision,
+            api_key=benchmark_config.api_key,
+            cache_dir=benchmark_config.cache_dir,
+            trust_remote_code=benchmark_config.trust_remote_code,
+            requires_safetensors=benchmark_config.requires_safetensors,
+            run_with_cli=benchmark_config.run_with_cli,
         )
         return (
             model_info is not None
@@ -569,11 +745,15 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The model configuration.
         """
-        model_id, revision = (
-            model_id.split("@") if "@" in model_id else (model_id, "main")
-        )
+        model_id_components = split_model_id(model_id=model_id)
         model_info = get_model_repo_info(
-            model_id=model_id, revision=revision, benchmark_config=benchmark_config
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            api_key=benchmark_config.api_key,
+            cache_dir=benchmark_config.cache_dir,
+            trust_remote_code=benchmark_config.trust_remote_code,
+            requires_safetensors=benchmark_config.requires_safetensors,
+            run_with_cli=benchmark_config.run_with_cli,
         )
         if model_info is None:
             raise InvalidModel(f"The model {model_id!r} could not be found.")
@@ -582,8 +762,9 @@ class VLLMModel(HuggingFaceEncoderModel):
         language_codes = list(language_mapping.keys())
         model_config = ModelConfig(
-            model_id=model_id,
-            revision=revision,
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            param=model_id_components.param,
             task=model_info.pipeline_tag,
             languages=[
                 language_mapping[tag]
@@ -603,7 +784,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         return model_config
     @property
-    def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
+    def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.
         Returns:
@@ -625,10 +806,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
-def load_model_and_tokenizer(
+def load_model_and_tokeniser(
     model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
 ) -> tuple["LLM", "PreTrainedTokenizer"]:
-    """Load the model and tokenizer.
+    """Load the model and tokeniser.
     Args:
         model_config:
@@ -637,7 +818,7 @@ def load_model_and_tokenizer(
             The benchmark configuration.
     Returns:
-        A pair (model, tokenizer), with the loaded model and tokenizer
+        A pair (model, tokeniser), with the loaded model and tokeniser
     """
     # Prefer base model ID if the model is an adapter - the adapter will be added on
     # during inference in this case
@@ -649,8 +830,8 @@ def load_model_and_tokenizer(
     hf_model_config = load_hf_model_config(
         model_id=model_id,
         num_labels=0,
-        id2label=dict(),
-        label2id=dict(),
+        id2label=HashableDict(),
+        label2id=HashableDict(),
         revision=revision,
         model_cache_dir=model_config.model_cache_dir,
         api_key=benchmark_config.api_key,
@@ -675,46 +856,55 @@ def load_model_and_tokenizer(
     dtype: str | torch.dtype = "auto"
     # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
-    if hf_model_config.torch_dtype == torch.float32:
+    if hf_model_config.dtype == torch.float32:
         if torch.cuda.is_bf16_supported():
-            logger.info(
+            log(
                 "You are loading a model with dtype FP32, which we will convert to "
                 "BF16 as FP32 is not supported by vLLM and BF16 is supported by your "
-                "GPU."
+                "GPU.",
+                level=logging.WARNING,
             )
             dtype = torch.bfloat16
         else:
-            logger.info(
+            log(
                 "You are loading a model with dtype FP32, which we will convert to "
                 "FP16 as FP32 is not supported by vLLM and BF16 is not supported by "
-                "your GPU."
+                "your GPU.",
+                level=logging.WARNING,
             )
             dtype = torch.float16
-    # If the model is a quantized model, we need to set the dtype to float16
-    if quantization is not None and hf_model_config.torch_dtype != torch.float16:
-        logger.info(
+    # If the model is a quantized model, we might need to change the dtype
+    if quantization == "mxfp4" and hf_model_config.dtype is None:
+        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+        log(
+            "You are loading a quantized model where `dtype` has not been set. "
+            f"Setting dtype to {dtype!r}.",
+            level=logging.DEBUG,
+        )
+    elif quantization is not None and hf_model_config.dtype != torch.float16:
+        log(
             "You are loading a quantized model with dtype "
-            f"{hf_model_config.torch_dtype}, which vLLM does not support. Setting "
-            "dtype to float16 instead."
+            f"{hf_model_config.dtype}, which vLLM does not support. Setting "
+            "dtype to float16 instead.",
+            level=logging.WARNING,
         )
         dtype = torch.float16
     # If the model is a bf16 model, we need to check the CUDA compute capability
-    if hf_model_config.torch_dtype == torch.bfloat16:
+    if hf_model_config.dtype == torch.bfloat16:
         min_cuda_compute_capability = get_min_cuda_compute_capability()
         required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
         if min_cuda_compute_capability is not None:
             if min_cuda_compute_capability < required_capability:
-                logger.info(
-                    "You are loading a model with "
-                    f"dtype {hf_model_config.torch_dtype}, "
-                    "which vLLM only supports for CUDA devices with"
-                    f"CUDA compute capability >={required_capability}. "
-                    "You are using one or more devices with "
-                    f"compute capability {min_cuda_compute_capability}. "
-                    "Setting dtype to float16 instead."
+                log(
+                    f"You are loading a model with dtype {hf_model_config.dtype}, "
+                    "which vLLM only supports for CUDA devices with CUDA compute "
+                    f"capability >={required_capability}. You are using one or more "
+                    f"devices with compute capability {min_cuda_compute_capability}. "
+                    "Setting dtype to float16 instead.",
+                    level=logging.WARNING,
                 )
                 dtype = torch.float16
@@ -741,31 +931,40 @@ def load_model_and_tokenizer(
     else:
         true_max_model_len = MAX_CONTEXT_LENGTH
-    tokenizer = load_tokenizer(
+    tokeniser = load_tokeniser(
         model_id=model_config.model_id,
         revision=model_config.revision,
         adapter_base_model_id=model_config.adapter_base_model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
         model_max_length=true_max_model_len,
-        model_cache_dir=model_config.model_cache_dir,
-        token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+        model_config=model_config,
+        token=get_hf_token(api_key=benchmark_config.api_key),
+    )
+    vllm_tokenisation_params = get_vllm_tokenisation_params(
+        tokeniser=tokeniser, model_config=model_config
     )
     clear_vllm()
     try:
         model = LLM(
-            model=model_id,
-            tokenizer=model_id,
+            model=(
+                model_id
+                if internet_connection_available()
+                else resolve_model_path(download_dir=download_dir)
+            ),
+            tokenizer=(
+                model_id
+                if internet_connection_available()
+                else resolve_model_path(download_dir=download_dir)
+            ),
             gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
             max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
             download_dir=download_dir,
             trust_remote_code=benchmark_config.trust_remote_code,
             revision=revision,
             seed=4242,
-            distributed_executor_backend=(
-                "ray" if torch.cuda.device_count() > 1 else "mp"
-            ),
+            distributed_executor_backend="mp",
             tensor_parallel_size=torch.cuda.device_count(),
             disable_custom_all_reduce=True,
             quantization=quantization,
@@ -776,38 +975,65 @@ def load_model_and_tokenizer(
             enable_prefix_caching=False,
             enable_lora=model_config.adapter_base_model_id is not None,
             max_lora_rank=256,
+            **vllm_tokenisation_params,
         )
     except (RuntimeError, ValueError, OSError) as e:
         if "awaiting a review from the repo authors" in str(e):
             raise InvalidModel(
                 f"The model {model_id!r} is awaiting a review from the repository "
                 "authors. Please try again later."
-            )
+            ) from e
         elif "trust_remote_code" in str(e):
             raise InvalidModel(
                 f"Loading the model {model_id!r} needs to trust remote code. "
                 "If you trust the suppliers of this model, then you can enable "
                 "this by setting the `--trust-remote-code` flag."
+            ) from e
+        elif "See stack trace for root cause." in str(
+            e
+        ) or "See root cause above." in str(e):
+            msg = (
+                f"The model {model_id!r} could not be loaded, but vLLM did not "
+                "mention exactly what happened. "
+            )
+            msg += (
+                (
+                    "Since you're running in verbose mode, you might see a descriptive "
+                    "error above already. Note however that if the error message urges "
+                    "you to set the environment variable `VLLM_ATTENTION_BACKEND` to "
+                    "'FLEX_ATTENTION', please try setting it to 'FLASH_ATTN' first, as "
+                    "that often solves the issue, whereas 'FLEX_ATTENTION' usually "
+                    "doesn't. If you don't see any descriptive error above, then you "
+                    "can try "
+                )
+                if benchmark_config.verbose
+                else "Try "
+            )
+            msg += (
+                "re-running the benchmark with the environment variable `FULL_LOG` "
+                "set to `1` to see the full stack trace. E.g., "
+                f"`FULL_LOG=1 euroeval --model {model_id}`."
             )
+            raise InvalidModel(msg) from e
         raise InvalidModel(
             f"The model {model_id!r} could not be loaded. The error was {e!r}."
-        )
+        ) from e
     model.config = hf_model_config
-    return model, tokenizer
+    return model, tokeniser
-def load_tokenizer(
+def load_tokeniser(
     model_id: str,
     revision: str,
     adapter_base_model_id: str | None,
     trust_remote_code: bool,
     model_max_length: int,
-    model_cache_dir: str,
+    model_config: "ModelConfig",
     token: str | bool,
 ) -> "PreTrainedTokenizer":
-    """Load the tokenizer.
+    """Load the tokeniser.
     Args:
         model_id:
@@ -821,64 +1047,97 @@ def load_tokenizer(
             Whether to trust remote code.
         model_max_length:
             The maximum length of the model.
-        model_cache_dir:
-            The cache directory for the model.
+        model_config:
+            The model configuration.
         token:
             The Hugging Face API token.
     Returns:
-        The loaded tokenizer.
+        The loaded tokeniser.
     """
     revision = revision if adapter_base_model_id is None else "main"
     config = AutoConfig.from_pretrained(
         adapter_base_model_id or model_id,
         revision=revision,
-        cache_dir=model_cache_dir,
+        cache_dir=model_config.model_cache_dir,
         token=token,
         trust_remote_code=trust_remote_code,
+        local_files_only=not internet_connection_available(),
     )
     num_retries = 5
     for _ in range(num_retries):
         try:
-            tokenizer = AutoTokenizer.from_pretrained(
+            # Mistral instruction-tuned models need a custom tokeniser
+            if model_id.startswith("mistralai/") and "base" not in model_id.lower():
+                tokeniser = MistralCommonTokenizer.from_pretrained(
+                    model_id,
+                    padding_side="left",
+                    truncation_side="left",
+                    model_max_length=model_max_length,
+                    token=token,
+                )
+                break
+            tokeniser = AutoTokenizer.from_pretrained(
                 model_id,
-                use_fast=True,
+                revision=revision,
+                use_fast=False if model_config.param == "slow-tokenizer" else True,
                 verbose=False,
                 trust_remote_code=trust_remote_code,
                 padding_side="left",
                 truncation_side="left",
                 model_max_length=model_max_length,
+                cache_dir=model_config.model_cache_dir,
                 config=config,
                 token=token,
+                local_files_only=not internet_connection_available(),
             )
             break
         except (json.JSONDecodeError, OSError, TypeError) as e:
             if adapter_base_model_id is None or model_id == adapter_base_model_id:
                 raise InvalidModel(
-                    f"Could not load tokenizer for model {model_id!r}. The error was "
+                    f"Could not load tokeniser for model {model_id!r}. The error was "
                     f"{str(e)}."
-                )
-            logger.debug(
-                f"Could not load tokenizer for {model_id!r}. Falling back to "
-                f"{adapter_base_model_id!r}."
+                ) from e
+            log(
+                f"Could not load tokeniser for {model_id!r}. Falling back to "
+                f"{adapter_base_model_id!r}.",
+                level=logging.DEBUG,
             )
             model_id = adapter_base_model_id
         except (TimeoutError, RequestError):
-            logger.info(f"Couldn't load tokenizer for {model_id!r}. Retrying.")
+            log(
+                f"Couldn't load tokeniser for {model_id!r}. Retrying.",
+                level=logging.WARNING,
+            )
             sleep(5)
             continue
+        except (KeyError, ValueError) as e:
+            if "mistral" in str(e).lower():
+                tokeniser = MistralCommonTokenizer.from_pretrained(
+                    model_id,
+                    padding_side="left",
+                    truncation_side="left",
+                    model_max_length=model_max_length,
+                    token=token,
+                )
+                break
+            raise InvalidModel(
+                f"Could not load tokeniser for model {model_id!r}. The error was "
+                f"{str(e)}."
+            ) from e
     else:
         raise InvalidModel(
-            f"Could not load tokenizer for model {model_id!r} after {num_retries} "
+            f"Could not load tokeniser for model {model_id!r} after {num_retries} "
             "attempts."
         )
     # Ensure that BOS, EOS and PAD tokens are set
-    tokenizer.bos_token, tokenizer.bos_token_id = get_bos_token(tokenizer=tokenizer)
-    tokenizer.eos_token, tokenizer.eos_token_id = get_eos_token(tokenizer=tokenizer)
-    tokenizer.pad_token, tokenizer.pad_token_id = get_pad_token(tokenizer=tokenizer)
+    if not isinstance(tokeniser, MistralCommonTokenizer):
+        tokeniser.bos_token, tokeniser.bos_token_id = get_bos_token(tokeniser=tokeniser)
+        tokeniser.eos_token, tokeniser.eos_token_id = get_eos_token(tokeniser=tokeniser)
+        tokeniser.pad_token, tokeniser.pad_token_id = get_pad_token(tokeniser=tokeniser)
-    return tokenizer
+    return tokeniser
 def clear_vllm() -> None:
@@ -886,80 +1145,93 @@ def clear_vllm() -> None:
     with contextlib.suppress(ValueError):
         destroy_model_parallel()
         destroy_distributed_environment()
-    if ray.is_initialized():
-        ray.shutdown()
     with contextlib.suppress(AssertionError):
         torch.distributed.destroy_process_group()
-    if ray.is_initialized():
-        ray.shutdown()
     clear_memory()
 def get_end_of_reasoning_token(
-    model: "LLM", tokenizer: "PreTrainedTokenizer", model_id: str
-) -> str | None:
+    model: "LLM", tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
+) -> str | re.Pattern | None:
     """Get the end-of-reasoning token for a generative model.
     Args:
         model:
             The vLLM model.
-        tokenizer:
-            The tokenizer.
-        model_id:
-            The model ID.
+        tokeniser:
+            The tokeniser.
+        model_config:
+            The model configuration.
     Returns:
         The end of reasoning token, or None if it could not be found.
     """
+    model_id = model_config.model_id
     # Create a prompt to check if the model uses the reasoning tokens
     prompt = "What is your name?"
-    if tokenizer.chat_template is not None:
-        templated_prompt = tokenizer.apply_chat_template(
+    if has_chat_template(tokeniser=tokeniser):
+        extra_kwargs = dict()
+        if model_config.param in {"thinking", "no-thinking"}:
+            extra_kwargs["enable_thinking"] = model_config.param == "thinking"
+        templated_prompt = apply_chat_template(
             conversation=[dict(role="user", content=prompt)],
+            tokeniser=tokeniser,
+            tokenise=False,
             add_generation_prompt=True,
-            tokenize=False,
+            **extra_kwargs,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
     # Check that the beginning-of-reasoning token is actually used by the model
-    completion = (
-        model.generate(
-            prompts=[prompt],
-            sampling_params=SamplingParams(max_tokens=10),
-            use_tqdm=False,
-        )[0]
-        .outputs[0]
-        .text
-    )
+    output = model.generate(
+        prompts=[prompt], sampling_params=SamplingParams(max_tokens=10), use_tqdm=False
+    )[0]
+    completion = tokeniser.decode(token_ids=output.outputs[0].token_ids)
     bor_reasoning_matches = [
         (bor_token, eor_token)
         for bor_token, eor_token in REASONING_TOKENS
-        if bor_token in prompt or bor_token in completion
+        if (
+            (
+                isinstance(bor_token, str)
+                and (bor_token in prompt or bor_token in completion)
+            )
+            or (
+                isinstance(bor_token, re.Pattern)
+                and (
+                    bor_token.search(prompt) is not None
+                    or bor_token.search(completion) is not None
+                )
+            )
+        )
     ]
     if not bor_reasoning_matches:
         log_once(
             f"The model {model_id!r} did not generate any beginning-of-reasoning "
-            "tokens in the prompt or the completion. Assuming the model is not "
-            "a reasoning model.",
-            level=logging.INFO,
+            "tokens in the prompt or the completion. Assuming the model is not a "
+            "reasoning model.",
+            level=logging.DEBUG,
         )
         return None
-    # Check that the beginning-of-reasoning token is actually used by the model
-    completion = (
-        model.generate(
-            prompts=[prompt],
-            sampling_params=SamplingParams(max_tokens=REASONING_MAX_TOKENS),
-            use_tqdm=False,
-        )[0]
-        .outputs[0]
-        .text
-    )
+    # Check that the end-of-reasoning token is actually used by the model
+    output = model.generate(
+        prompts=[prompt],
+        sampling_params=SamplingParams(max_tokens=REASONING_MAX_TOKENS),
+        use_tqdm=False,
+    )[0]
+    completion = tokeniser.decode(token_ids=output.outputs[0].token_ids)
     eor_reasoning_matches = [
         (bor_token, eor_token)
         for bor_token, eor_token in bor_reasoning_matches
-        if eor_token in completion
+        if (
+            (isinstance(eor_token, str) and eor_token in completion)
+            or (
+                isinstance(eor_token, re.Pattern)
+                and eor_token.search(completion) is not None
+            )
+        )
     ]
     if not eor_reasoning_matches:
         log_once(
@@ -968,7 +1240,7 @@ def get_end_of_reasoning_token(
             "the beginning-of-reasoning tokens "
             f"{[bor_token for bor_token, _ in bor_reasoning_matches]!r}. "
             "This is probably not correct, so please report this issue.",
-            level=logging.INFO,
+            level=logging.WARNING,
         )
         return None
@@ -978,14 +1250,21 @@ def get_end_of_reasoning_token(
             f"model {model_id!r}. Using {eor_reasoning_matches[0]!r} as "
             "the reasoning token. If this is not the correct reasoning token, "
             "please report this issue.",
-            level=logging.INFO,
+            level=logging.WARNING,
         )
     bor_token, eor_token = eor_reasoning_matches[0]
+    bor_token_logging: str = (
+        bor_token if isinstance(bor_token, str) else bor_token.pattern
+    )
+    eor_token_logging: str = (
+        eor_token if isinstance(eor_token, str) else eor_token.pattern
+    )
     log_once(
-        f"Detected beginning-of-reasoning token {bor_token!r} and end-of-reasoning "
-        f"token {eor_token!r} for model {model_id!r}.",
-        level=logging.INFO,
+        f"Detected beginning-of-reasoning token {bor_token_logging!r} and "
+        f"end-of-reasoning token {eor_token_logging!r} for model {model_id!r}.",
+        level=logging.DEBUG,
     )
     return eor_token
@@ -993,22 +1272,21 @@ def get_end_of_reasoning_token(
 def get_custom_stop_tokens(
     model: "LLM",
-    tokenizer: "PreTrainedTokenizer",
+    tokeniser: "PreTrainedTokenizer",
     model_id: str,
-    is_reasoning_model: bool,
+    generative_type: GenerativeType | None,
 ) -> list[str]:
     """Get the stop tokens for a generative model.
     Args:
         model:
             The vLLM model.
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.
         model_id:
             The model ID.
-        is_reasoning_model:
-            Whether the model is a reasoning model. This is used to determine the number
-            of generated tokens to allow before stopping the generation.
+        generative_type:
+            The generative type of the model.
     Returns:
         A list of stop tokens.
@@ -1016,25 +1294,26 @@ def get_custom_stop_tokens(
     candidate_stop_tokens = CUSTOM_STOP_TOKENS
     prompt = "Hello"
-    if tokenizer.chat_template is not None:
-        templated_prompt = tokenizer.apply_chat_template(
+    if has_chat_template(tokeniser=tokeniser):
+        templated_prompt = apply_chat_template(
             conversation=[dict(role="user", content=prompt)],
+            tokeniser=tokeniser,
+            tokenise=False,
             add_generation_prompt=True,
-            tokenize=False,
+            enable_thinking=generative_type == GenerativeType.REASONING,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
-    max_tokens = REASONING_MAX_TOKENS if is_reasoning_model else 10
-    completion = (
-        model.generate(
-            prompts=[prompt],
-            sampling_params=SamplingParams(max_tokens=max_tokens, temperature=0.0),
-            use_tqdm=False,
-        )[0]
-        .outputs[0]
-        .text
+    max_tokens = (
+        REASONING_MAX_TOKENS if generative_type == GenerativeType.REASONING else 10
     )
+    output = model.generate(
+        prompts=[prompt],
+        sampling_params=SamplingParams(max_tokens=max_tokens, temperature=0.0),
+        use_tqdm=False,
+    )[0]
+    completion = tokeniser.decode(token_ids=output.outputs[0].token_ids)
     stop_tokens = [
         stop_token
@@ -1042,27 +1321,50 @@ def get_custom_stop_tokens(
         if stop_token in prompt or stop_token in completion
     ]
     if stop_tokens:
-        logger.debug(
+        log(
             f"Found the following custom stop tokens for model {model_id!r}: "
-            f"{stop_tokens}."
+            f"{stop_tokens}.",
+            level=logging.DEBUG,
         )
     else:
-        logger.debug(f"Found no custom stop tokens for model {model_id!r}.")
+        log(f"Found no custom stop tokens for model {model_id!r}.", level=logging.DEBUG)
     return stop_tokens
-def get_pbar_without_leave(*tqdm_args, **tqdm_kwargs) -> tqdm:
-    """Get a progress bar for vLLM which disappears after completion.
+def get_vllm_tokenisation_params(
+    tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
+) -> dict[str, t.Any]:
+    """Get the tokenisation parameters for vLLM.
     Args:
-        *tqdm_args:
-            Positional arguments to pass to tqdm.
-        **tqdm_kwargs:
-            Additional keyword arguments to pass to tqdm.
+        tokeniser:
+            The tokeniser.
+        model_config:
+            The model configuration.
     Returns:
-        A tqdm progress bar.
+        A dictionary of tokenisation parameters to pass to vLLM.
     """
-    tqdm_kwargs.pop("leave", None)  # Remove the 'leave' key if it exists
-    return tqdm(*tqdm_args, leave=False, **tqdm_kwargs)
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        tokeniser_mode = "mistral"
+    elif model_config.param == "slow-tokenizer":
+        tokeniser_mode = "slow"
+    else:
+        tokeniser_mode = "auto"
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        config_format = "mistral"
+    else:
+        config_format = "auto"
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        load_format = "mistral"
+    else:
+        load_format = "auto"
+    return dict(
+        tokenizer_mode=tokeniser_mode,
+        config_format=config_format,
+        load_format=load_format,
+    )

EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl

EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl