EuroEval 16.3.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval has been flagged as possibly problematic.
- euroeval/__init__.py +9 -2
- euroeval/benchmark_config_factory.py +51 -50
- euroeval/benchmark_modules/base.py +9 -21
- euroeval/benchmark_modules/fresh.py +2 -1
- euroeval/benchmark_modules/hf.py +101 -71
- euroeval/benchmark_modules/litellm.py +115 -53
- euroeval/benchmark_modules/vllm.py +107 -92
- euroeval/benchmarker.py +144 -121
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +86 -8
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +80 -29
- euroeval/data_models.py +338 -330
- euroeval/dataset_configs/__init__.py +12 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +55 -93
- euroeval/dataset_configs/dutch.py +48 -87
- euroeval/dataset_configs/english.py +45 -77
- euroeval/dataset_configs/estonian.py +42 -34
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -69
- euroeval/dataset_configs/french.py +39 -75
- euroeval/dataset_configs/german.py +45 -82
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -79
- euroeval/dataset_configs/latvian.py +28 -35
- euroeval/dataset_configs/lithuanian.py +28 -26
- euroeval/dataset_configs/norwegian.py +72 -115
- euroeval/dataset_configs/polish.py +33 -61
- euroeval/dataset_configs/portuguese.py +33 -66
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/spanish.py +42 -77
- euroeval/dataset_configs/swedish.py +52 -90
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +24 -17
- euroeval/generation.py +15 -14
- euroeval/generation_utils.py +8 -8
- euroeval/languages.py +395 -323
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +21 -6
- euroeval/metrics/llm_as_a_judge.py +6 -4
- euroeval/metrics/pipeline.py +17 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +17 -19
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +99 -42
- euroeval/prompt_templates/multiple_choice.py +102 -38
- euroeval/prompt_templates/named_entity_recognition.py +172 -51
- euroeval/prompt_templates/reading_comprehension.py +119 -42
- euroeval/prompt_templates/sentiment_classification.py +110 -40
- euroeval/prompt_templates/summarization.py +85 -40
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +11 -10
- euroeval/speed_benchmark.py +5 -6
- euroeval/task_group_utils/multiple_choice_classification.py +2 -4
- euroeval/task_group_utils/question_answering.py +24 -16
- euroeval/task_group_utils/sequence_classification.py +48 -35
- euroeval/task_group_utils/text_to_text.py +19 -9
- euroeval/task_group_utils/token_classification.py +21 -17
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +33 -22
- euroeval/types.py +10 -9
- euroeval/utils.py +35 -149
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +196 -39
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.3.0.dist-info/RECORD +0 -71
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py

```diff
@@ -14,10 +14,9 @@ from time import sleep
 import torch
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
-from tqdm.auto import tqdm
-from transformers import MistralCommonTokenizer
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers.tokenization_mistral_common import MistralCommonTokenizer
 from urllib3.exceptions import RequestError

 from ..constants import (
@@ -30,7 +29,7 @@ from ..constants import (
     REASONING_TOKENS,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
-from ..data_models import GenerativeModelOutput, ModelConfig
+from ..data_models import GenerativeModelOutput, HashableDict, ModelConfig
 from ..enums import (
     BatchingPreference,
     GenerativeType,
@@ -50,6 +49,7 @@ from ..generation_utils import (
     raise_if_wrong_params,
 )
 from ..languages import get_all_languages
+from ..logging_utils import get_pbar, log, log_once, no_terminal_output
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
```
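The `from ..logging_utils import ...` line added above is the visible tip of the new `euroeval/logging_utils.py` module (+250 lines, not part of this file's diff). Judging from how the rest of this file uses it, `log` and `log_once` wrap the shared `euroeval` logger with an explicit `level` argument, `no_terminal_output` is a context manager that silences terminal output unless verbose mode is on, and `get_pbar` builds the progress bar handed to vLLM. A minimal sketch of helpers with those semantics, as an assumption about the interface rather than the actual EuroEval implementation:

```python
import contextlib
import logging
import os
import sys
import typing as t

from tqdm.auto import tqdm

logger = logging.getLogger("euroeval")
_seen_messages: set[str] = set()


def log(message: str, level: int = logging.INFO) -> None:
    """Log `message` at `level` using the shared `euroeval` logger."""
    logger.log(level, message)


def log_once(message: str, level: int = logging.INFO) -> None:
    """Log `message` only the first time it is seen."""
    if message not in _seen_messages:
        _seen_messages.add(message)
        log(message, level=level)


def get_pbar(*args: t.Any, **kwargs: t.Any) -> tqdm:
    """Progress bar that is cleared from the terminal after completion."""
    kwargs.pop("leave", None)
    return tqdm(*args, leave=False, **kwargs)


@contextlib.contextmanager
def no_terminal_output(disable: bool = False) -> t.Iterator[None]:
    """Silence stdout/stderr inside the block unless `disable` is True."""
    if disable:
        yield
        return
    with open(os.devnull, "w") as devnull:
        old_out, old_err = sys.stdout, sys.stderr
        sys.stdout, sys.stderr = devnull, devnull
        try:
            yield
        finally:
            sys.stdout, sys.stderr = old_out, old_err
```

The `get_pbar` sketch mirrors the `get_pbar_without_leave` helper that is removed at the bottom of this diff, which had exactly this behaviour.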
```diff
@@ -73,7 +73,6 @@ from ..utils import (
     get_hf_token,
     get_min_cuda_compute_capability,
     internet_connection_available,
-    log_once,
     resolve_model_path,
     split_model_id,
 )
@@ -86,7 +85,7 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
         destroy_model_parallel,
     )
     from vllm.lora.request import LoRARequest
-    from vllm.sampling_params import
+    from vllm.sampling_params import StructuredOutputsParams

 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -95,8 +94,6 @@ if t.TYPE_CHECKING:

     from ..data_models import BenchmarkConfig, DatasetConfig, Task

-logger = logging.getLogger("euroeval")
-

 class VLLMModel(HuggingFaceEncoderModel):
     """A generative model using the vLLM inference framework."""
@@ -132,9 +129,10 @@ class VLLMModel(HuggingFaceEncoderModel):
             model_config=model_config, allowed_params=self.allowed_params
         )

-
-
-
+        with no_terminal_output(disable=benchmark_config.verbose):
+            model, tokeniser = load_model_and_tokeniser(
+                model_config=model_config, benchmark_config=benchmark_config
+            )
         self._model: "LLM" = model
         self._tokeniser: "PreTrainedTokenizer" = tokeniser

@@ -245,6 +243,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    model_config=self.model_config,
                     first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
@@ -394,10 +393,11 @@ class VLLMModel(HuggingFaceEncoderModel):
             self.dataset_config.task.uses_structured_output
             or (self.dataset_config.task.uses_logprobs and self.dataset_config.labels)
         ) and self.generative_type == GenerativeType.REASONING:
-
-
+            structured_outputs = None
+            log(
                 "The dataset uses structured output, but we are not using it as the "
-                "model is a reasoning model."
+                "model is a reasoning model.",
+                level=logging.DEBUG,
             )
         elif self.dataset_config.task.uses_structured_output:
             ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
@@ -412,21 +412,29 @@ class VLLMModel(HuggingFaceEncoderModel):
                 f"{json.dumps(structured_generation_schema)}",
                 level=logging.DEBUG,
             )
-
+            structured_outputs = StructuredOutputsParams(
+                json=structured_generation_schema
+            )
         elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
-
-
-
-
+            choice_labels = [
+                self.dataset_config.prompt_label_mapping[label]
+                for label in self.dataset_config.labels
+            ]
+            if "first_label_token_mapping" in self.buffer and isinstance(
+                self.buffer["first_label_token_mapping"], dict
+            ):
+                choice_labels = [
+                    self.buffer["first_label_token_mapping"][label]
+                    for label in choice_labels
                 ]
-            )
+            structured_outputs = StructuredOutputsParams(choice=choice_labels)
             log_once(
                 "Using structured generation with the choices: "
-                f"{
+                f"{structured_outputs.choice!r}.",
                 level=logging.DEBUG,
             )
         else:
-
+            structured_outputs = None
             log_once(
                 "Not using structured generation as the dataset does not require it.",
                 level=logging.DEBUG,
```
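The heart of this change is the move to vLLM's `StructuredOutputsParams`: the benchmark now builds either a JSON-schema constraint (for structured-output tasks such as NER) or a fixed-choice constraint (for logprobs-based classification), and passes it to `SamplingParams` through the `structured_outputs` argument seen in the next hunk. A standalone sketch of the same pattern on a recent vLLM release (older releases exposed this as `guided_decoding`/`GuidedDecodingParams`); the model name, labels and schema are placeholders, not taken from EuroEval:

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import StructuredOutputsParams

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")  # placeholder model

# Constrain decoding to a fixed label set, as done for logprobs-based tasks.
choice_params = SamplingParams(
    temperature=0.0,
    max_tokens=8,
    structured_outputs=StructuredOutputsParams(choice=["positive", "negative"]),
)

# Or constrain decoding to a JSON schema, as done for structured-output tasks.
schema = {
    "type": "object",
    "properties": {"persons": {"type": "array", "items": {"type": "string"}}},
    "required": ["persons"],
}
json_params = SamplingParams(
    temperature=0.0,
    max_tokens=128,
    structured_outputs=StructuredOutputsParams(json=schema),
)

outputs = llm.generate(
    prompts=["The film was wonderful. Sentiment:"], sampling_params=choice_params
)
print(outputs[0].outputs[0].text)  # constrained to "positive" or "negative"
```

When a `first_label_token_mapping` is available, the code in the hunk above constrains generation to the mapped first tokens rather than the full label strings, presumably so the first generated token can be matched directly against the label set.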
```diff
@@ -445,14 +453,14 @@ class VLLMModel(HuggingFaceEncoderModel):
             else None,
             temperature=0.0,
             stop=[stop_token for stop_token in stop_tokens if stop_token],
-
+            structured_outputs=structured_outputs,
         )

         # If any of the prompts are empty then we need to replace them with a BOS token
         # so that the vLLM model can generate from them
-        prompts:
+        prompts: c.Sequence[str] = inputs["text"]
         if any(len(prompt) == 0 for prompt in prompts):
-
+            log("Found empty prompts, replacing with BOS token.", level=logging.DEBUG)
             prompts = [
                 prompt if len(prompt) > 0 else str(self._tokeniser.bos_token)
                 for prompt in prompts
@@ -480,13 +488,14 @@ class VLLMModel(HuggingFaceEncoderModel):
                 raw_outputs = self._model.generate(
                     prompts=prompts,
                     sampling_params=sampling_params,
-                    use_tqdm=False if input_is_a_test else
+                    use_tqdm=False if input_is_a_test else get_pbar,
                     lora_request=self.buffer.get("lora_request"),
                 )
                 break
             except TypeError as e:
-
-                    f"Encountered error during vLLM generation: {str(e)}. Retrying..."
+                log(
+                    f"Encountered error during vLLM generation: {str(e)}. Retrying...",
+                    level=logging.DEBUG,
                 )
                 sleep(1)
             except ValueError as e:
```
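`LLM.generate` now receives the shared `get_pbar` factory instead of a local tqdm wrapper: vLLM accepts either a boolean or a tqdm-style callable for `use_tqdm`, so passing a factory customises the progress bar (the removed `get_pbar_without_leave` helper at the bottom of this diff did the same thing locally). A small sketch of the mechanism, with a placeholder model:

```python
from tqdm.auto import tqdm
from vllm import LLM, SamplingParams


def get_pbar(*args, **kwargs) -> tqdm:
    """Progress bar that disappears from the terminal once generation finishes."""
    kwargs.pop("leave", None)
    return tqdm(*args, leave=False, **kwargs)


llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")  # placeholder model
outputs = llm.generate(
    prompts=["Hello, world!"],
    sampling_params=SamplingParams(max_tokens=16),
    use_tqdm=get_pbar,  # pass False to disable the bar, or a callable to customise it
)
```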
```diff
@@ -498,10 +507,11 @@ class VLLMModel(HuggingFaceEncoderModel):
                     re.search(pattern, str(e), flags=re.IGNORECASE) is not None
                     for pattern in truncate_error_messages
                 ):
-
-                        "Prompts are too long, so truncating them and trying again..."
+                    log(
+                        "Prompts are too long, so truncating them and trying again...",
+                        level=logging.WARNING,
                     )
-
+                    log(f"The error message was: {str(e)}", level=logging.DEBUG)

                     # If we have already tried truncating the prompts a few times, then
                     # we truncate a bit more aggressively
@@ -544,49 +554,50 @@ class VLLMModel(HuggingFaceEncoderModel):
                 f"{num_extra_outputs!r} extra outputs."
             )
         else:
-
+            log(
                 f"Filtered out {num_extra_outputs:,} extra outputs from the model, "
                 "which occured as we interupted the generation when we truncated "
-                "the prompts."
+                "the prompts.",
+                level=logging.DEBUG,
             )

         # Parse the raw model outputs
-        completion_ids:
-            output.outputs[0].token_ids for output in raw_outputs
+        completion_ids: c.Sequence[c.Sequence[int]] = [
+            list(output.outputs[0].token_ids) for output in raw_outputs
         ]
         completions = self._tokeniser.batch_decode(
             sequences=[
                 torch.LongTensor(completion_id) for completion_id in completion_ids
-            ]
+            ],
+            skip_special_tokens=True,
         )
         if (
             self.end_of_reasoning_token is not None
             and self.generative_type == GenerativeType.REASONING
         ):
+            num_samples_without_eor_token = 0
             for idx in range(len(completions)):
                 if self.end_of_reasoning_token in completions[idx]:
                     completions[idx] = completions[idx].split(
                         self.end_of_reasoning_token
                     )[-1]
-                elif self.benchmark_config.verbose:
-                    logger.warning(
-                        f"The model {self.model_config.model_id!r} is a reasoning "
-                        "model, but the generated output does not contain the end of "
-                        f"reasoning token ({self.end_of_reasoning_token!r}). Using "
-                        "an empty string as the prediction instead."
-                    )
-                    completions[idx] = ""
                 else:
-
-                        f"The model {self.model_config.model_id!r} is a reasoning "
-                        "model, but the generated output does not contain the end of "
-                        f"reasoning token ({self.end_of_reasoning_token!r}). Using "
-                        "an empty string as the prediction instead. Only showing "
-                        "this warning once - see all occurrences if you run with the "
-                        "`verbose` flag.",
-                        level=logging.WARNING,
-                    )
+                    num_samples_without_eor_token += 1
                     completions[idx] = ""
+            if num_samples_without_eor_token > 0:
+                log_once(
+                    f"The model {self.model_config.model_id!r} is a reasoning "
+                    "model, but the generated output did not contain the end of "
+                    f"reasoning token ({self.end_of_reasoning_token!r}) in "
+                    f"{num_samples_without_eor_token:,}/{len(completions):,} of "
+                    "the samples. Using an empty string for all these samples "
+                    "instead.",
+                    level=(
+                        logging.WARNING
+                        if num_samples_without_eor_token / len(completions) > 0.5
+                        else logging.DEBUG
+                    ),
+                )
         stop_token_pattern = re.compile(
             "|".join(re.escape(stop_token) for stop_token in stop_tokens)
         )
```
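Reasoning-model output handling is now aggregated: rather than warning on every sample, the loop counts completions that lack the end-of-reasoning marker, blanks them, and logs once, at WARNING level only when more than half of the batch is affected. A compact sketch of the same splitting logic, assuming a `</think>`-style marker (the real token is model-specific in EuroEval):

```python
import logging

logger = logging.getLogger("example")


def strip_reasoning(completions: list[str], end_of_reasoning_token: str) -> list[str]:
    """Keep only the text after the marker, blanking samples that never emitted it."""
    stripped: list[str] = []
    num_missing = 0
    for completion in completions:
        if end_of_reasoning_token in completion:
            stripped.append(completion.split(end_of_reasoning_token)[-1])
        else:
            num_missing += 1
            stripped.append("")
    if num_missing > 0:
        level = (
            logging.WARNING
            if num_missing / len(completions) > 0.5
            else logging.DEBUG
        )
        logger.log(level, f"{num_missing:,}/{len(completions):,} samples had no marker.")
    return stripped


print(strip_reasoning(["<think>hmm</think> positive", "negative"], "</think>"))
# [' positive', '']
```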
```diff
@@ -604,13 +615,13 @@ class VLLMModel(HuggingFaceEncoderModel):

         # Add logprobs scores to the output
         if self.buffer["first_label_token_mapping"]:
-            scores:
+            scores: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]] = [
                 [
                     [
-                        (obj.decoded_token, obj.logprob)
+                        (obj.decoded_token or "", obj.logprob)
                         for obj in token_logprobs_dict.values()
                     ]
-                    for token_logprobs_dict in raw_output.outputs[0].logprobs
+                    for token_logprobs_dict in raw_output.outputs[0].logprobs or list()
                 ]
                 for raw_output in raw_outputs
             ]
```
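The two `or` fallbacks added here guard against `None` values in vLLM's output: `outputs[0].logprobs` is only populated when `SamplingParams(logprobs=N)` was requested, and a `Logprob` object's `decoded_token` is optional. A self-contained sketch of reading the same structure, with a placeholder model:

```python
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")  # placeholder model
outputs = llm.generate(
    prompts=["The sentiment of 'I loved it' is"],
    sampling_params=SamplingParams(max_tokens=1, temperature=0.0, logprobs=10),
)

completion = outputs[0].outputs[0]
# `logprobs` holds one dict per generated token, mapping token id -> Logprob.
for token_logprobs_dict in completion.logprobs or []:
    scores = [
        (obj.decoded_token or "", obj.logprob)  # decoded_token may be None
        for obj in token_logprobs_dict.values()
    ]
    print(scores)
```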
```diff
@@ -648,7 +659,13 @@ class VLLMModel(HuggingFaceEncoderModel):
         revision = model_id_components.revision

         model_info = get_model_repo_info(
-            model_id=model_id,
+            model_id=model_id,
+            revision=revision,
+            api_key=benchmark_config.api_key,
+            cache_dir=benchmark_config.cache_dir,
+            trust_remote_code=benchmark_config.trust_remote_code,
+            requires_safetensors=benchmark_config.requires_safetensors,
+            run_with_cli=benchmark_config.run_with_cli,
         )
         return (
             model_info is not None
@@ -674,7 +691,11 @@ class VLLMModel(HuggingFaceEncoderModel):
         model_info = get_model_repo_info(
             model_id=model_id_components.model_id,
             revision=model_id_components.revision,
-
+            api_key=benchmark_config.api_key,
+            cache_dir=benchmark_config.cache_dir,
+            trust_remote_code=benchmark_config.trust_remote_code,
+            requires_safetensors=benchmark_config.requires_safetensors,
+            run_with_cli=benchmark_config.run_with_cli,
         )
         if model_info is None:
             raise InvalidModel(f"The model {model_id!r} could not be found.")
@@ -705,7 +726,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         return model_config

     @property
-    def data_collator(self) -> c.Callable[[
+    def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
        """The data collator used to prepare samples during finetuning.

         Returns:
@@ -751,8 +772,8 @@ def load_model_and_tokeniser(
     hf_model_config = load_hf_model_config(
         model_id=model_id,
         num_labels=0,
-        id2label=
-        label2id=
+        id2label=HashableDict(),
+        label2id=HashableDict(),
         revision=revision,
         model_cache_dir=model_config.model_cache_dir,
         api_key=benchmark_config.api_key,
```
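`load_hf_model_config` is now called with `HashableDict` instances rather than plain dicts, which fits the new `euroeval/caching_utils.py` module: arguments have to be hashable before a call can be memoised with something like `functools.lru_cache`, and an ordinary `dict` is not. The actual EuroEval classes are not shown in this diff; a minimal sketch of the idea, with a hypothetical cached loader:

```python
import functools
import typing as t


class HashableDict(dict):
    """A dict usable as an argument to cached functions (treated as read-only)."""

    def __hash__(self) -> int:  # type: ignore[override]
        return hash(frozenset(self.items()))


@functools.lru_cache(maxsize=None)
def cached_config_lookup(model_id: str, id2label: HashableDict) -> dict[str, t.Any]:
    # A plain dict here would fail with "TypeError: unhashable type: 'dict'"
    # as soon as lru_cache tries to hash the call's arguments.
    return {"model_id": model_id, "id2label": dict(id2label)}


print(cached_config_lookup("some/model", HashableDict({0: "negative", 1: "positive"})))
```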
```diff
@@ -779,32 +800,36 @@ def load_model_and_tokeniser(
     # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
     if hf_model_config.dtype == torch.float32:
         if torch.cuda.is_bf16_supported():
-
+            log(
                 "You are loading a model with dtype FP32, which we will convert to "
                 "BF16 as FP32 is not supported by vLLM and BF16 is supported by your "
-                "GPU."
+                "GPU.",
+                level=logging.WARNING,
             )
             dtype = torch.bfloat16
         else:
-
+            log(
                 "You are loading a model with dtype FP32, which we will convert to "
                 "FP16 as FP32 is not supported by vLLM and BF16 is not supported by "
-                "your GPU."
+                "your GPU.",
+                level=logging.WARNING,
             )
             dtype = torch.float16

     # If the model is a quantized model, we might need to change the dtype
     if quantization == "mxfp4" and hf_model_config.dtype is None:
         dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
-
+        log(
             "You are loading a quantized model where `dtype` has not been set. "
-            f"Setting dtype to {dtype!r}."
+            f"Setting dtype to {dtype!r}.",
+            level=logging.DEBUG,
         )
     elif quantization is not None and hf_model_config.dtype != torch.float16:
-
+        log(
             "You are loading a quantized model with dtype "
             f"{hf_model_config.dtype}, which vLLM does not support. Setting "
-            "dtype to float16 instead."
+            "dtype to float16 instead.",
+            level=logging.WARNING,
         )
         dtype = torch.float16

```
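The dtype selection itself is unchanged here, only the messages now go through `log`: FP32 checkpoints are downcast to BF16 when the GPU supports it and to FP16 otherwise (vLLM does not serve FP32), and quantised checkpoints with an unsupported or unset dtype are coerced as well. A standalone sketch of that decision, assuming a CUDA device is available:

```python
import torch


def pick_vllm_dtype(
    config_dtype: torch.dtype | None, quantization: str | None
) -> torch.dtype | None:
    """Mirror of the decision above: avoid FP32 and prefer BF16 when supported."""
    dtype = config_dtype
    if config_dtype == torch.float32:
        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    if quantization == "mxfp4" and config_dtype is None:
        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    elif quantization is not None and config_dtype != torch.float16:
        dtype = torch.float16  # quantised weights are served in FP16 in this code path
    return dtype


print(pick_vllm_dtype(torch.float32, None))
```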
```diff
@@ -815,12 +840,13 @@ def load_model_and_tokeniser(

     if min_cuda_compute_capability is not None:
         if min_cuda_compute_capability < required_capability:
-
+            log(
                 f"You are loading a model with dtype {hf_model_config.dtype}, "
                 "which vLLM only supports for CUDA devices with CUDA compute "
                 f"capability >={required_capability}. You are using one or more "
                 f"devices with compute capability {min_cuda_compute_capability}. "
-                "Setting dtype to float16 instead."
+                "Setting dtype to float16 instead.",
+                level=logging.WARNING,
             )
             dtype = torch.float16

@@ -987,13 +1013,17 @@ def load_tokeniser(
                     f"Could not load tokeniser for model {model_id!r}. The error was "
                     f"{str(e)}."
                 ) from e
-
+            log(
                 f"Could not load tokeniser for {model_id!r}. Falling back to "
-                f"{adapter_base_model_id!r}."
+                f"{adapter_base_model_id!r}.",
+                level=logging.DEBUG,
             )
             model_id = adapter_base_model_id
         except (TimeoutError, RequestError):
-
+            log(
+                f"Couldn't load tokeniser for {model_id!r}. Retrying.",
+                level=logging.WARNING,
+            )
             sleep(5)
             continue
         except (KeyError, ValueError) as e:
@@ -1192,32 +1222,17 @@ def get_custom_stop_tokens(
         if stop_token in prompt or stop_token in completion
     ]
     if stop_tokens:
-
+        log(
             f"Found the following custom stop tokens for model {model_id!r}: "
-            f"{stop_tokens}."
+            f"{stop_tokens}.",
+            level=logging.DEBUG,
        )
     else:
-
+        log(f"Found no custom stop tokens for model {model_id!r}.", level=logging.DEBUG)

     return stop_tokens


-def get_pbar_without_leave(*tqdm_args, **tqdm_kwargs) -> tqdm:
-    """Get a progress bar for vLLM which disappears after completion.
-
-    Args:
-        *tqdm_args:
-            Positional arguments to pass to tqdm.
-        **tqdm_kwargs:
-            Additional keyword arguments to pass to tqdm.
-
-    Returns:
-        A tqdm progress bar.
-    """
-    tqdm_kwargs.pop("leave", None)  # Remove the 'leave' key if it exists
-    return tqdm(*tqdm_args, leave=False, **tqdm_kwargs)
-
-
 def get_vllm_tokenisation_params(
     tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
 ) -> dict[str, t.Any]:
```