EuroEval 16.2.2-py3-none-any.whl → 16.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +7 -4
- euroeval/benchmark_config_factory.py +0 -4
- euroeval/benchmark_modules/base.py +3 -16
- euroeval/benchmark_modules/fresh.py +5 -2
- euroeval/benchmark_modules/hf.py +107 -66
- euroeval/benchmark_modules/litellm.py +103 -55
- euroeval/benchmark_modules/vllm.py +155 -82
- euroeval/benchmarker.py +184 -129
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +1 -1
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +14 -11
- euroeval/data_models.py +12 -4
- euroeval/dataset_configs/__init__.py +3 -0
- euroeval/dataset_configs/czech.py +79 -0
- euroeval/dataset_configs/danish.py +10 -13
- euroeval/dataset_configs/dutch.py +0 -3
- euroeval/dataset_configs/english.py +0 -3
- euroeval/dataset_configs/estonian.py +11 -1
- euroeval/dataset_configs/finnish.py +0 -3
- euroeval/dataset_configs/french.py +0 -3
- euroeval/dataset_configs/german.py +0 -3
- euroeval/dataset_configs/italian.py +0 -3
- euroeval/dataset_configs/latvian.py +2 -4
- euroeval/dataset_configs/lithuanian.py +68 -0
- euroeval/dataset_configs/norwegian.py +0 -3
- euroeval/dataset_configs/polish.py +0 -3
- euroeval/dataset_configs/portuguese.py +0 -3
- euroeval/dataset_configs/slovak.py +60 -0
- euroeval/dataset_configs/spanish.py +0 -3
- euroeval/dataset_configs/swedish.py +10 -15
- euroeval/finetuning.py +21 -15
- euroeval/generation.py +10 -10
- euroeval/generation_utils.py +2 -3
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +10 -6
- euroeval/metrics/llm_as_a_judge.py +5 -3
- euroeval/metrics/pipeline.py +22 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +11 -14
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/linguistic_acceptability.py +30 -3
- euroeval/prompt_templates/multiple_choice.py +34 -1
- euroeval/prompt_templates/named_entity_recognition.py +71 -11
- euroeval/prompt_templates/reading_comprehension.py +41 -3
- euroeval/prompt_templates/sentiment_classification.py +34 -1
- euroeval/prompt_templates/summarization.py +26 -6
- euroeval/scores.py +7 -7
- euroeval/speed_benchmark.py +3 -5
- euroeval/task_group_utils/multiple_choice_classification.py +0 -3
- euroeval/task_group_utils/question_answering.py +0 -3
- euroeval/task_group_utils/sequence_classification.py +43 -31
- euroeval/task_group_utils/text_to_text.py +17 -8
- euroeval/task_group_utils/token_classification.py +10 -9
- euroeval/tokenisation_utils.py +22 -20
- euroeval/utils.py +30 -147
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA +182 -61
- euroeval-16.4.0.dist-info/RECORD +75 -0
- euroeval-16.2.2.dist-info/RECORD +0 -70
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py

@@ -14,10 +14,9 @@ from time import sleep
 import torch
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
-from tqdm.auto import tqdm
-from transformers import MistralCommonTokenizer
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers.tokenization_mistral_common import MistralCommonTokenizer
 from urllib3.exceptions import RequestError
 
 from ..constants import (
@@ -30,7 +29,7 @@ from ..constants import (
     REASONING_TOKENS,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
-from ..data_models import GenerativeModelOutput, ModelConfig
+from ..data_models import GenerativeModelOutput, HashableDict, ModelConfig
 from ..enums import (
     BatchingPreference,
     GenerativeType,
@@ -50,6 +49,7 @@ from ..generation_utils import (
     raise_if_wrong_params,
 )
 from ..languages import get_all_languages
+from ..logging_utils import get_pbar, log, log_once, no_terminal_output
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -73,7 +73,6 @@ from ..utils import (
     get_hf_token,
     get_min_cuda_compute_capability,
     internet_connection_available,
-    log_once,
     resolve_model_path,
     split_model_id,
 )
@@ -86,7 +85,7 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
         destroy_model_parallel,
     )
     from vllm.lora.request import LoRARequest
-    from vllm.sampling_params import
+    from vllm.sampling_params import StructuredOutputsParams
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -95,8 +94,6 @@ if t.TYPE_CHECKING:
 
     from ..data_models import BenchmarkConfig, DatasetConfig, Task
 
-logger = logging.getLogger("euroeval")
-
 
 class VLLMModel(HuggingFaceEncoderModel):
     """A generative model using the vLLM inference framework."""
@@ -104,7 +101,7 @@ class VLLMModel(HuggingFaceEncoderModel):
     fresh_model = False
     batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = True
-    allowed_params = {re.compile(r".*"): ["thinking", "no-thinking"]}
+    allowed_params = {re.compile(r".*"): ["thinking", "no-thinking", "slow-tokenizer"]}
 
     def __init__(
         self,
@@ -132,9 +129,10 @@ class VLLMModel(HuggingFaceEncoderModel):
             model_config=model_config, allowed_params=self.allowed_params
         )
 
-
-
-
+        with no_terminal_output(disable=benchmark_config.verbose):
+            model, tokeniser = load_model_and_tokeniser(
+                model_config=model_config, benchmark_config=benchmark_config
+            )
         self._model: "LLM" = model
         self._tokeniser: "PreTrainedTokenizer" = tokeniser
 
@@ -245,6 +243,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    model_config=self.model_config,
                     first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
@@ -394,10 +393,11 @@ class VLLMModel(HuggingFaceEncoderModel):
             self.dataset_config.task.uses_structured_output
             or (self.dataset_config.task.uses_logprobs and self.dataset_config.labels)
         ) and self.generative_type == GenerativeType.REASONING:
-
-
+            structured_outputs = None
+            log(
                 "The dataset uses structured output, but we are not using it as the "
-                "model is a reasoning model."
+                "model is a reasoning model.",
+                level=logging.DEBUG,
             )
         elif self.dataset_config.task.uses_structured_output:
             ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
@@ -412,9 +412,11 @@ class VLLMModel(HuggingFaceEncoderModel):
                 f"{json.dumps(structured_generation_schema)}",
                 level=logging.DEBUG,
             )
-
+            structured_outputs = StructuredOutputsParams(
+                json=structured_generation_schema
+            )
         elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
-
+            structured_outputs = StructuredOutputsParams(
                 choice=[
                     self.dataset_config.prompt_label_mapping[label]
                     for label in self.dataset_config.labels
@@ -422,11 +424,11 @@ class VLLMModel(HuggingFaceEncoderModel):
             )
             log_once(
                 "Using structured generation with the choices: "
-                f"{
+                f"{structured_outputs.choice!r}.",
                 level=logging.DEBUG,
             )
         else:
-
+            structured_outputs = None
             log_once(
                 "Not using structured generation as the dataset does not require it.",
                 level=logging.DEBUG,
@@ -445,14 +447,14 @@ class VLLMModel(HuggingFaceEncoderModel):
             else None,
             temperature=0.0,
             stop=[stop_token for stop_token in stop_tokens if stop_token],
-
+            structured_outputs=structured_outputs,
         )
 
         # If any of the prompts are empty then we need to replace them with a BOS token
         # so that the vLLM model can generate from them
         prompts: list[str] = inputs["text"]
         if any(len(prompt) == 0 for prompt in prompts):
-
+            log("Found empty prompts, replacing with BOS token.", level=logging.DEBUG)
            prompts = [
                 prompt if len(prompt) > 0 else str(self._tokeniser.bos_token)
                 for prompt in prompts
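The hunks above swap vLLM's older guided-decoding plumbing for `StructuredOutputsParams`, passed to `SamplingParams` via the new `structured_outputs` argument. A minimal sketch of the same pattern outside EuroEval, assuming a vLLM release that exposes `StructuredOutputsParams` as imported above; the model ID, prompt, and label set are placeholders:

from vllm import LLM, SamplingParams
from vllm.sampling_params import StructuredOutputsParams

# Placeholder model ID; any model servable by vLLM works here.
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")

# Constrain generation to a fixed label set, mirroring the classification branch above.
sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=8,
    structured_outputs=StructuredOutputsParams(
        choice=["positive", "neutral", "negative"]
    ),
)

outputs = llm.generate(
    prompts=["Review: 'Great phone!' Sentiment:"],
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)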
@@ -480,13 +482,14 @@ class VLLMModel(HuggingFaceEncoderModel):
                 raw_outputs = self._model.generate(
                     prompts=prompts,
                     sampling_params=sampling_params,
-                    use_tqdm=False if input_is_a_test else
+                    use_tqdm=False if input_is_a_test else get_pbar,
                     lora_request=self.buffer.get("lora_request"),
                 )
                 break
             except TypeError as e:
-
-                    f"Encountered error during vLLM generation: {str(e)}. Retrying..."
+                log(
+                    f"Encountered error during vLLM generation: {str(e)}. Retrying...",
+                    level=logging.DEBUG,
                 )
                 sleep(1)
             except ValueError as e:
@@ -498,10 +501,11 @@ class VLLMModel(HuggingFaceEncoderModel):
                     re.search(pattern, str(e), flags=re.IGNORECASE) is not None
                     for pattern in truncate_error_messages
                 ):
-
-                        "Prompts are too long, so truncating them and trying again..."
+                    log(
+                        "Prompts are too long, so truncating them and trying again...",
+                        level=logging.WARNING,
                     )
-
+                    log(f"The error message was: {str(e)}", level=logging.DEBUG)
 
                     # If we have already tried truncating the prompts a few times, then
                     # we truncate a bit more aggressively
@@ -544,26 +548,49 @@ class VLLMModel(HuggingFaceEncoderModel):
                     f"{num_extra_outputs!r} extra outputs."
                 )
             else:
-
+                log(
                     f"Filtered out {num_extra_outputs:,} extra outputs from the model, "
                     "which occured as we interupted the generation when we truncated "
-                    "the prompts."
+                    "the prompts.",
+                    level=logging.DEBUG,
                 )
 
         # Parse the raw model outputs
         completion_ids: list[list[int]] = [
-            output.outputs[0].token_ids for output in raw_outputs
+            list(output.outputs[0].token_ids) for output in raw_outputs
         ]
         completions = self._tokeniser.batch_decode(
             sequences=[
                 torch.LongTensor(completion_id) for completion_id in completion_ids
             ]
         )
-        if
-
-
-
-
+        if (
+            self.end_of_reasoning_token is not None
+            and self.generative_type == GenerativeType.REASONING
+        ):
+            num_samples_without_eor_token = 0
+            for idx in range(len(completions)):
+                if self.end_of_reasoning_token in completions[idx]:
+                    completions[idx] = completions[idx].split(
+                        self.end_of_reasoning_token
+                    )[-1]
+                else:
+                    num_samples_without_eor_token += 1
+                    completions[idx] = ""
+            if num_samples_without_eor_token > 0:
+                log_once(
+                    f"The model {self.model_config.model_id!r} is a reasoning "
+                    "model, but the generated output did not contain the end of "
+                    f"reasoning token ({self.end_of_reasoning_token!r}) in "
+                    f"{num_samples_without_eor_token:,}/{len(completions):,} of "
+                    "the samples. Using an empty string for all these samples "
+                    "instead.",
+                    level=(
+                        logging.WARNING
+                        if num_samples_without_eor_token / len(completions) > 0.5
+                        else logging.DEBUG
+                    ),
+                )
         stop_token_pattern = re.compile(
             "|".join(re.escape(stop_token) for stop_token in stop_tokens)
         )
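The reasoning-model handling added above keeps only the text after the model's end-of-reasoning token and substitutes an empty string when the token never appears. A small self-contained sketch of that policy, using `</think>` as an assumed end-of-reasoning marker and a hypothetical helper name:

def strip_reasoning(
    completions: list[str], end_of_reasoning_token: str = "</think>"
) -> list[str]:
    """Keep only the text after the end-of-reasoning token; blank out samples without it."""
    stripped: list[str] = []
    num_missing = 0
    for completion in completions:
        if end_of_reasoning_token in completion:
            stripped.append(completion.split(end_of_reasoning_token)[-1])
        else:
            num_missing += 1
            stripped.append("")
    if num_missing > 0:
        print(f"{num_missing:,}/{len(completions):,} samples had no {end_of_reasoning_token!r}")
    return stripped

print(strip_reasoning(["Let me think.</think> positive", "no reasoning marker here"]))
# [' positive', '']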
@@ -584,10 +611,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         scores: list[list[list[tuple[str, float]]]] = [
             [
                 [
-                    (obj.decoded_token, obj.logprob)
+                    (obj.decoded_token or "", obj.logprob)
                     for obj in token_logprobs_dict.values()
                 ]
-                for token_logprobs_dict in raw_output.outputs[0].logprobs
+                for token_logprobs_dict in raw_output.outputs[0].logprobs or list()
             ]
             for raw_output in raw_outputs
         ]
@@ -625,7 +652,13 @@ class VLLMModel(HuggingFaceEncoderModel):
         revision = model_id_components.revision
 
         model_info = get_model_repo_info(
-            model_id=model_id,
+            model_id=model_id,
+            revision=revision,
+            api_key=benchmark_config.api_key,
+            cache_dir=benchmark_config.cache_dir,
+            trust_remote_code=benchmark_config.trust_remote_code,
+            requires_safetensors=benchmark_config.requires_safetensors,
+            run_with_cli=benchmark_config.run_with_cli,
         )
         return (
             model_info is not None
@@ -651,7 +684,11 @@ class VLLMModel(HuggingFaceEncoderModel):
         model_info = get_model_repo_info(
             model_id=model_id_components.model_id,
             revision=model_id_components.revision,
-
+            api_key=benchmark_config.api_key,
+            cache_dir=benchmark_config.cache_dir,
+            trust_remote_code=benchmark_config.trust_remote_code,
+            requires_safetensors=benchmark_config.requires_safetensors,
+            run_with_cli=benchmark_config.run_with_cli,
         )
         if model_info is None:
             raise InvalidModel(f"The model {model_id!r} could not be found.")
@@ -728,8 +765,8 @@ def load_model_and_tokeniser(
     hf_model_config = load_hf_model_config(
         model_id=model_id,
         num_labels=0,
-        id2label=
-        label2id=
+        id2label=HashableDict(),
+        label2id=HashableDict(),
         revision=revision,
         model_cache_dir=model_config.model_cache_dir,
         api_key=benchmark_config.api_key,
@@ -756,32 +793,36 @@ def load_model_and_tokeniser(
     # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
     if hf_model_config.dtype == torch.float32:
         if torch.cuda.is_bf16_supported():
-
+            log(
                 "You are loading a model with dtype FP32, which we will convert to "
                 "BF16 as FP32 is not supported by vLLM and BF16 is supported by your "
-                "GPU."
+                "GPU.",
+                level=logging.WARNING,
             )
             dtype = torch.bfloat16
         else:
-
+            log(
                 "You are loading a model with dtype FP32, which we will convert to "
                 "FP16 as FP32 is not supported by vLLM and BF16 is not supported by "
-                "your GPU."
+                "your GPU.",
+                level=logging.WARNING,
            )
             dtype = torch.float16
 
     # If the model is a quantized model, we might need to change the dtype
     if quantization == "mxfp4" and hf_model_config.dtype is None:
         dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
-
+        log(
             "You are loading a quantized model where `dtype` has not been set. "
-            f"Setting dtype to {dtype!r}."
+            f"Setting dtype to {dtype!r}.",
+            level=logging.DEBUG,
         )
     elif quantization is not None and hf_model_config.dtype != torch.float16:
-
+        log(
             "You are loading a quantized model with dtype "
             f"{hf_model_config.dtype}, which vLLM does not support. Setting "
-            "dtype to float16 instead."
+            "dtype to float16 instead.",
+            level=logging.WARNING,
         )
         dtype = torch.float16
 
@@ -792,12 +833,13 @@ def load_model_and_tokeniser(
 
     if min_cuda_compute_capability is not None:
         if min_cuda_compute_capability < required_capability:
-
+            log(
                 f"You are loading a model with dtype {hf_model_config.dtype}, "
                 "which vLLM only supports for CUDA devices with CUDA compute "
                 f"capability >={required_capability}. You are using one or more "
                 f"devices with compute capability {min_cuda_compute_capability}. "
-                "Setting dtype to float16 instead."
+                "Setting dtype to float16 instead.",
+                level=logging.WARNING,
             )
             dtype = torch.float16
 
@@ -830,9 +872,12 @@ def load_model_and_tokeniser(
         adapter_base_model_id=model_config.adapter_base_model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
         model_max_length=true_max_model_len,
-
+        model_config=model_config,
         token=get_hf_token(api_key=benchmark_config.api_key),
     )
+    vllm_tokenisation_params = get_vllm_tokenisation_params(
+        tokeniser=tokeniser, model_config=model_config
+    )
 
     clear_vllm()
 
@@ -865,16 +910,7 @@ def load_model_and_tokeniser(
             enable_prefix_caching=False,
             enable_lora=model_config.adapter_base_model_id is not None,
             max_lora_rank=256,
-
-            tokenizer_mode="mistral"
-            if isinstance(tokeniser, MistralCommonTokenizer)
-            else "auto",
-            config_format="mistral"
-            if isinstance(tokeniser, MistralCommonTokenizer)
-            else "auto",
-            load_format="mistral"
-            if isinstance(tokeniser, MistralCommonTokenizer)
-            else "auto",
+            **vllm_tokenisation_params,
         )
     except (RuntimeError, ValueError, OSError) as e:
         if "awaiting a review from the repo authors" in str(e):
@@ -903,7 +939,7 @@ def load_tokeniser(
     adapter_base_model_id: str | None,
     trust_remote_code: bool,
     model_max_length: int,
-
+    model_config: "ModelConfig",
     token: str | bool,
 ) -> "PreTrainedTokenizer":
     """Load the tokeniser.
@@ -920,8 +956,8 @@ def load_tokeniser(
             Whether to trust remote code.
         model_max_length:
             The maximum length of the model.
-
-            The
+        model_config:
+            The model configuration.
         token:
             The Hugging Face API token.
 
@@ -932,7 +968,7 @@ def load_tokeniser(
     config = AutoConfig.from_pretrained(
         adapter_base_model_id or model_id,
         revision=revision,
-        cache_dir=model_cache_dir,
+        cache_dir=model_config.model_cache_dir,
         token=token,
         trust_remote_code=trust_remote_code,
         local_files_only=not internet_connection_available(),
@@ -940,15 +976,25 @@ def load_tokeniser(
     num_retries = 5
     for _ in range(num_retries):
         try:
+            # Mistral instruction-tuned models need a custom tokeniser
+            if model_id.startswith("mistralai/") and "base" not in model_id.lower():
+                tokeniser = MistralCommonTokenizer.from_pretrained(
+                    model_id,
+                    padding_side="left",
+                    truncation_side="left",
+                    model_max_length=model_max_length,
+                    token=token,
+                )
+                break
             tokeniser = AutoTokenizer.from_pretrained(
                 model_id,
-                use_fast=True,
+                use_fast=False if model_config.param == "slow-tokenizer" else True,
                 verbose=False,
                 trust_remote_code=trust_remote_code,
                 padding_side="left",
                 truncation_side="left",
                 model_max_length=model_max_length,
-                cache_dir=model_cache_dir,
+                cache_dir=model_config.model_cache_dir,
                 config=config,
                 token=token,
                 local_files_only=not internet_connection_available(),
@@ -960,13 +1006,17 @@ def load_tokeniser(
                 f"Could not load tokeniser for model {model_id!r}. The error was "
                 f"{str(e)}."
             ) from e
-
+            log(
                 f"Could not load tokeniser for {model_id!r}. Falling back to "
-                f"{adapter_base_model_id!r}."
+                f"{adapter_base_model_id!r}.",
+                level=logging.DEBUG,
             )
             model_id = adapter_base_model_id
         except (TimeoutError, RequestError):
-
+            log(
+                f"Couldn't load tokeniser for {model_id!r}. Retrying.",
+                level=logging.WARNING,
+            )
             sleep(5)
             continue
         except (KeyError, ValueError) as e:
@@ -1165,27 +1215,50 @@ def get_custom_stop_tokens(
         if stop_token in prompt or stop_token in completion
     ]
     if stop_tokens:
-
+        log(
             f"Found the following custom stop tokens for model {model_id!r}: "
-            f"{stop_tokens}."
+            f"{stop_tokens}.",
+            level=logging.DEBUG,
         )
     else:
-
+        log(f"Found no custom stop tokens for model {model_id!r}.", level=logging.DEBUG)
 
     return stop_tokens
 
 
-def
-""
+def get_vllm_tokenisation_params(
+    tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
+) -> dict[str, t.Any]:
+    """Get the tokenisation parameters for vLLM.
 
     Args:
-
-
-
-
+        tokeniser:
+            The tokeniser.
+        model_config:
+            The model configuration.
 
     Returns:
-        A
+        A dictionary of tokenisation parameters to pass to vLLM.
     """
-
-
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        tokeniser_mode = "mistral"
+    elif model_config.param == "slow-tokenizer":
+        tokeniser_mode = "slow"
+    else:
+        tokeniser_mode = "auto"
+
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        config_format = "mistral"
+    else:
+        config_format = "auto"
+
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        load_format = "mistral"
+    else:
+        load_format = "auto"
+
+    return dict(
+        tokenizer_mode=tokeniser_mode,
+        config_format=config_format,
+        load_format=load_format,
+    )
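The new `get_vllm_tokenisation_params` helper centralises the `tokenizer_mode`, `config_format`, and `load_format` flags that were previously computed inline when constructing the `LLM` engine. A rough sketch of how such a dictionary is unpacked into the constructor; the model ID is a placeholder and the values shown are the defaults from the helper above:

from vllm import LLM

# The helper returns plain keyword arguments for vLLM's LLM() constructor, so they
# can be unpacked directly into the call (see the `**vllm_tokenisation_params` hunk
# above). "mistral" is used for MistralCommonTokenizer models, and "slow" when the
# slow-tokenizer model parameter is set.
vllm_tokenisation_params = {
    "tokenizer_mode": "auto",
    "config_format": "auto",
    "load_format": "auto",
}

llm = LLM(
    model="Qwen/Qwen2.5-0.5B-Instruct",  # placeholder model ID
    **vllm_tokenisation_params,
)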