EuroEval 15.4.2-py3-none-any.whl → 15.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +2 -2
- euroeval/benchmark_modules/base.py +3 -2
- euroeval/benchmark_modules/fresh.py +8 -6
- euroeval/benchmark_modules/hf.py +44 -33
- euroeval/benchmark_modules/litellm.py +314 -120
- euroeval/benchmark_modules/vllm.py +99 -59
- euroeval/benchmarker.py +52 -21
- euroeval/callbacks.py +2 -2
- euroeval/constants.py +9 -2
- euroeval/data_models.py +258 -44
- euroeval/dataset_configs/__init__.py +61 -0
- euroeval/dataset_configs/danish.py +120 -0
- euroeval/dataset_configs/dutch.py +123 -0
- euroeval/dataset_configs/english.py +88 -0
- euroeval/dataset_configs/faroese.py +53 -0
- euroeval/dataset_configs/french.py +83 -0
- euroeval/dataset_configs/german.py +91 -0
- euroeval/dataset_configs/icelandic.py +148 -0
- euroeval/dataset_configs/italian.py +81 -0
- euroeval/dataset_configs/norwegian.py +178 -0
- euroeval/dataset_configs/spanish.py +78 -0
- euroeval/dataset_configs/swedish.py +100 -0
- euroeval/exceptions.py +10 -10
- euroeval/finetuning.py +6 -10
- euroeval/generation.py +1 -0
- euroeval/human_evaluation.py +2 -2
- euroeval/languages.py +20 -13
- euroeval/model_cache.py +1 -1
- euroeval/model_loading.py +1 -12
- euroeval/prompt_templates/__init__.py +8 -0
- euroeval/prompt_templates/linguistic_acceptability.py +112 -0
- euroeval/prompt_templates/multiple_choice.py +97 -0
- euroeval/prompt_templates/named_entity_recognition.py +257 -0
- euroeval/prompt_templates/reading_comprehension.py +118 -0
- euroeval/prompt_templates/sentiment_classification.py +137 -0
- euroeval/prompt_templates/summarization.py +97 -0
- euroeval/speed_benchmark.py +1 -1
- euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
- euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
- euroeval/{task_utils → task_group_utils}/sequence_classification.py +45 -10
- euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
- euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
- euroeval/tasks.py +54 -0
- euroeval/tokenization_utils.py +343 -0
- euroeval/types.py +3 -1
- euroeval/utils.py +5 -254
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/METADATA +31 -9
- euroeval-15.6.0.dist-info/RECORD +59 -0
- euroeval/dataset_configs.py +0 -2408
- euroeval-15.4.2.dist-info/RECORD +0 -40
- /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py
CHANGED
@@ -1,6 +1,7 @@
 """Generative models using the vLLM inference framework."""
 
 import collections.abc as c
+import contextlib
 import importlib.util
 import itertools as it
 import json
@@ -20,15 +21,18 @@ from datasets import DatasetDict
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
 from tqdm.auto import tqdm
-from transformers import AutoConfig
+from transformers.models.auto.configuration_auto import AutoConfig
+from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError
 
 from ..constants import (
     GENERATIVE_PIPELINE_TAGS,
+    MAX_CONTEXT_LENGTH,
     MAX_LOGPROBS,
     MERGE_TAGS,
     REASONING_MAX_TOKENS,
-    TASK_GROUPS_USING_LOGPROBS,
     TASKS_USING_JSON,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
@@ -53,39 +57,39 @@ from ..exceptions import (
     NeedsExtraInstalled,
 )
 from ..languages import get_all_languages
-from ..task_utils import (
+from ..task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,
     token_classification,
 )
+from ..tokenization_utils import (
+    get_bos_token,
+    get_end_of_chat_token_ids,
+    get_eos_token,
+    get_first_label_token_mapping,
+    should_prompts_be_stripped,
+)
 from ..types import ExtractLabelsFunction
 from ..utils import (
     clear_memory,
     create_model_cache_dir,
-    get_bos_token,
-    get_end_of_chat_token_ids,
-    get_eos_token,
     get_min_cuda_compute_capability,
     log_once,
-    should_prompts_be_stripped,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
 
 if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
     from vllm import LLM, RequestOutput, SamplingParams
+    from vllm.distributed.parallel_state import (
+        destroy_distributed_environment,
+        destroy_model_parallel,
+    )
     from vllm.lora.request import LoRARequest
 
-    try:
-        from vllm.model_executor.parallel_utils.parallel_state import (
-            destroy_model_parallel,
-        )
-    except ImportError:
-        from vllm.distributed.parallel_state import destroy_model_parallel
-
 if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
     from outlines.models.vllm import adapt_tokenizer
-    from outlines.processors import JSONLogitsProcessor
+    from outlines.processors.structured import JSONLogitsProcessor
 
 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
     import ray
@@ -122,11 +126,8 @@ class VLLMModel(HuggingFaceEncoderModel):
         ):
             raise NeedsExtraInstalled(extra="generative")
 
-        output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
         model, tokenizer = load_model_and_tokenizer(
-            model_config=model_config,
-            benchmark_config=benchmark_config,
-            output_scores=output_scores,
+            model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: LLM = model
         self._tokenizer: PreTrainedTokenizer = tokenizer
@@ -142,8 +143,12 @@ class VLLMModel(HuggingFaceEncoderModel):
             benchmark_config=benchmark_config,
         )
 
-        self.buffer
-
+        self.buffer |= dict(
+            instruction_model=self._tokenizer.chat_template is not None,
+            first_label_token_mapping=get_first_label_token_mapping(
+                dataset_config=self.dataset_config, tokenizer=self._tokenizer
+            ),
+        )
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
                 repo_id=self.model_config.model_id,
@@ -154,6 +159,14 @@ class VLLMModel(HuggingFaceEncoderModel):
                 lora_name="adapter", lora_int_id=1, lora_path=adapter_path
             )
 
+    def __del__(self) -> None:
+        """Clean up the model and tokenizer."""
+        clear_vllm()
+        if hasattr(self, "_model"):
+            del self._model
+        if hasattr(self, "_tokenizer"):
+            del self._tokenizer
+
     @property
     def generative_type(self) -> GenerativeType | None:
         """Get the generative type of the model.
@@ -185,6 +198,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
                 return text_to_text.extract_labels_from_generation
@@ -327,7 +341,7 @@ class VLLMModel(HuggingFaceEncoderModel):
             pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
             logits_processor = JSONLogitsProcessor(
                 schema=pydantic_class,
-                tokenizer=adapt_tokenizer(tokenizer=self._tokenizer),  #
+                tokenizer=adapt_tokenizer(tokenizer=self._tokenizer),  # type: ignore
                 whitespace_pattern=r" ?",
             )
             log_once(
@@ -338,6 +352,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         else:
             logits_processor = None
 
+        # Get the mapping from labels to the first token in the label. We call this each
+        # time we generate a new dataset since the dataset config can change
+        self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
+            dataset_config=self.dataset_config, tokenizer=self._tokenizer
+        )
+
         # Define the parameters used for vLLM generation
         max_tokens: int = (
             REASONING_MAX_TOKENS
@@ -346,7 +366,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         sampling_params = SamplingParams(
             max_tokens=max_tokens,
-            logprobs=MAX_LOGPROBS if self.buffer["
+            logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
             temperature=0.0,
             stop=[stop_token for stop_token in stop_tokens if stop_token],
             logits_processors=[logits_processor] if logits_processor else None,
@@ -416,7 +436,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         completions = [completion.strip() for completion in completions]
 
         # Add logprobs scores to the output
-        if self.buffer["
+        if self.buffer["first_label_token_mapping"]:
             scores: list[list[list[tuple[str, float]]]] = [
                 [
                     [
@@ -846,7 +866,7 @@
 
 
 def load_model_and_tokenizer(
-    model_config: ModelConfig, benchmark_config: BenchmarkConfig, output_scores: bool
+    model_config: ModelConfig, benchmark_config: BenchmarkConfig
 ) -> "tuple[LLM, PreTrainedTokenizer]":
     """Load the model and tokenizer.
 
@@ -855,11 +875,9 @@ def load_model_and_tokenizer(
             The model configuration.
         benchmark_config:
             The benchmark configuration.
-        output_scores:
-            Whether to output scores.
 
     Returns:
-
+        A pair (model, tokenizer), with the loaded model and tokenizer
     """
     # Prefer base model ID if the model is an adapter - the adapter will be added on
     # during inference in this case
@@ -893,7 +911,27 @@ def load_model_and_tokenizer(
     if quantization == "awq" and importlib.util.find_spec("awq") is None:
         raise NeedsExtraInstalled(extra="quantization")
 
+    # Start with dtype being the "auto" vLLM dtype
     dtype: str | torch.dtype = "auto"
+
+    # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
+    if hf_model_config.torch_dtype == torch.float32:
+        if torch.cuda.is_bf16_supported():
+            logger.info(
+                "You are loading a model with dtype FP32, which we will convert to "
+                "BF16 as FP32 is not supported by vLLM and BF16 is supported by your "
+                "GPU."
+            )
+            dtype = torch.bfloat16
+        else:
+            logger.info(
+                "You are loading a model with dtype FP32, which we will convert to "
+                "FP16 as FP32 is not supported by vLLM and BF16 is not supported by "
+                "your GPU."
+            )
+            dtype = torch.float16
+
+    # If the model is a quantized model, we need to set the dtype to float16
     if quantization is not None and hf_model_config.torch_dtype != torch.float16:
         logger.info(
             "You are loading a quantized model with dtype "
@@ -902,6 +940,7 @@ def load_model_and_tokenizer(
         )
         dtype = torch.float16
 
+    # If the model is a bf16 model, we need to check the CUDA compute capability
     if hf_model_config.torch_dtype == torch.bfloat16:
         min_cuda_compute_capability = get_min_cuda_compute_capability()
         required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
@@ -940,29 +979,38 @@ def load_model_and_tokenizer(
     if len(true_max_model_len_candidates) > 0:
         true_max_model_len = min(true_max_model_len_candidates)
     else:
-        true_max_model_len =
+        true_max_model_len = MAX_CONTEXT_LENGTH
 
-
+    tokenizer = load_tokenizer(
+        model_id=model_config.model_id,
+        revision=model_config.revision,
+        adapter_base_model_id=model_config.adapter_base_model_id,
+        trust_remote_code=benchmark_config.trust_remote_code,
+        model_max_length=true_max_model_len,
+        model_cache_dir=model_config.model_cache_dir,
+        token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+    )
 
-
+    clear_vllm()
 
     try:
         model = LLM(
             model=model_id,
             tokenizer=model_id,
-            gpu_memory_utilization=0.
-            max_model_len=min(true_max_model_len,
+            gpu_memory_utilization=0.9,
+            max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
             download_dir=download_dir,
             trust_remote_code=benchmark_config.trust_remote_code,
             revision=revision,
             seed=4242,
-            distributed_executor_backend=
+            distributed_executor_backend=(
+                "ray" if torch.cuda.device_count() > 1 else "mp"
+            ),
             tensor_parallel_size=torch.cuda.device_count(),
             disable_custom_all_reduce=True,
             quantization=quantization,
             dtype=dtype,
             enforce_eager=True,
-            max_logprobs=MAX_LOGPROBS if output_scores else None,
             # TEMP: Prefix caching isn't supported with sliding window in vLLM yet,
             # so we disable it for now
             enable_prefix_caching=False,
@@ -988,16 +1036,6 @@ def load_model_and_tokenizer(
     model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
     model.config = hf_model_config
 
-    tokenizer = load_tokenizer(
-        model_id=model_config.model_id,
-        revision=model_config.revision,
-        adapter_base_model_id=model_config.adapter_base_model_id,
-        trust_remote_code=benchmark_config.trust_remote_code,
-        model_max_length=true_max_model_len,
-        model_cache_dir=model_config.model_cache_dir,
-        token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
-    )
-
     return model, tokenizer
 
 
@@ -1118,13 +1156,16 @@ def _run_engine_with_fixed_progress_bars(
 
 def clear_vllm() -> None:
     """Clear the GPU memory used by the vLLM model, enabling re-initialisation."""
-
+    with contextlib.suppress(ValueError):
         destroy_model_parallel()
-
-
-
+        destroy_distributed_environment()
+    if ray.is_initialized():
+        ray.shutdown()
+    with contextlib.suppress(AssertionError):
+        torch.distributed.destroy_process_group()
     if ray.is_initialized():
         ray.shutdown()
+    clear_memory()
 
 
 def get_end_of_reasoning_token_id(
@@ -1148,24 +1189,23 @@ def get_end_of_reasoning_token_id(
     if tokenizer.chat_template is None:
         prompt = "What is your name?"
     else:
-
+        templated_prompt = tokenizer.apply_chat_template(
             conversation=[dict(role="user", content="What is your name?")],
             add_generation_prompt=True,
             tokenize=False,
         )
-
+        assert isinstance(templated_prompt, str)
+        prompt = templated_prompt
 
     # Generate a completion and remove the BOS token from it, to not confuse it with the
     # potential reasoning token
-
-
-
-
-        use_tqdm=False,
-    )[0]
-    .outputs[0]
-    .text
+    model_output = model.generate(
+        prompts=[prompt],
+        sampling_params=SamplingParams(max_tokens=3, temperature=0.0),
+        use_tqdm=False,
     )
+    completion = model_output[0].outputs[0].text
+
     if tokenizer.bos_token is not None:
         if isinstance(tokenizer.bos_token, str):
             prompt = prompt.replace(tokenizer.bos_token, "").strip()
euroeval/benchmarker.py
CHANGED
@@ -1,5 +1,6 @@
 """Class that benchmarks language models."""
 
+import contextlib
 import json
 import logging
 import re
@@ -13,7 +14,7 @@ from time import sleep
 from torch.distributed import destroy_process_group
 
 from .benchmark_config_factory import build_benchmark_config
-from .constants import GENERATIVE_PIPELINE_TAGS
+from .constants import GENERATIVE_DATASET_TASK_GROUPS, GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
@@ -366,14 +367,18 @@
             dataset_names=benchmark_config.datasets
         )
 
+        total_benchmarks = len(model_ids) * len(dataset_configs)
+        num_finished_benchmarks = 0
+
         current_benchmark_results: list[BenchmarkResult] = list()
-        for
+        for model_id in model_ids:
             try:
                 model_config = get_model_config(
-                    model_id=
+                    model_id=model_id, benchmark_config=benchmark_config
                 )
             except InvalidModel as e:
                 logger.info(e.message)
+                num_finished_benchmarks += len(dataset_configs)
                 continue
 
             loaded_model: BenchmarkModule | None = None
@@ -381,21 +386,35 @@
                 # Skip if we have already benchmarked this model on this dataset and
                 # we are not forcing the benchmark
                 if not benchmark_config.force and model_has_been_benchmarked(
-                    model_id=
+                    model_id=model_id,
                     dataset=dataset_config.name,
                     few_shot=benchmark_config.few_shot,
                     validation_split=not benchmark_config.evaluate_test_split,
                     benchmark_results=self.benchmark_results,
                 ):
                     logger.debug(
-                        f"Skipping benchmarking {
-                        " as it
+                        f"Skipping benchmarking {model_id} on "
+                        f"{dataset_config.pretty_name}, as it "
+                        "has already been benchmarked."
+                    )
+                    num_finished_benchmarks += 1
+                    continue
+
+                # Skip if the model is an encoder model and the task is generative
+                task_is_generative = (
+                    dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
+                )
+                if model_config.model_type == ModelType.ENCODER and task_is_generative:
+                    logger.debug(
+                        f"Skipping benchmarking {model_id} on "
+                        f"{dataset_config.pretty_name}, as it is an encoder model and "
+                        "the task is generative."
                     )
                     continue
 
                 # We do not re-initialise generative models as their architecture is not
                 # customised to specific datasets
-                if model_config.
+                if model_config.model_type == ModelType.GENERATIVE:
                     initial_logging(
                         model_config=model_config,
                         dataset_config=dataset_config,
@@ -413,6 +432,15 @@
                     if benchmark_config.raise_errors:
                         raise e
                     logger.info(e.message)
+
+                    # Add the remaining number of benchmarks for the model to
+                    # our benchmark counter, since we're skipping the rest of
+                    # them
+                    num_finished_benchmarks += (
+                        len(dataset_configs)
+                        - dataset_configs.index(dataset_config)
+                        - 1
+                    )
                     break
                 else:
                     loaded_model.dataset_config = dataset_config
@@ -432,27 +460,33 @@
                     raise benchmark_output_or_err
 
                 elif isinstance(benchmark_output_or_err, InvalidBenchmark):
-
-
-                    logger.info(
-                        f"{m_id} could not be benchmarked on "
-                        f"{dataset_config.pretty_name}. Skipping. The error message "
-                        f"raised was {benchmark_output_or_err.message!r}."
-                    )
+                    logger.info(benchmark_output_or_err.message)
+                    num_finished_benchmarks += 1
                     continue
 
                 elif isinstance(benchmark_output_or_err, InvalidModel):
-                    if benchmark_config.raise_errors:
-                        raise benchmark_output_or_err
                     logger.info(benchmark_output_or_err.message)
+
+                    # Add the remaining number of benchmarks for the model to our
+                    # benchmark counter, since we're skipping the rest of them
+                    num_finished_benchmarks += (
+                        len(dataset_configs) - dataset_configs.index(dataset_config) - 1
+                    )
                     break
 
                 else:
-                    record = benchmark_output_or_err
+                    record: BenchmarkResult = benchmark_output_or_err
                     current_benchmark_results.append(record)
                     if benchmark_config.save_results:
                         record.append_to_results(results_path=self.results_path)
 
+                    num_finished_benchmarks += 1
+                    logger.info(
+                        f"Finished {num_finished_benchmarks} out of "
+                        f"{total_benchmarks} benchmarks."
+                    )
+
+            del loaded_model
             if benchmark_config.clear_model_cache:
                 clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
 
@@ -464,11 +498,8 @@
         # point and block the progress of another member of the process group. This
        # constraint has always been present, but this warning has only been added
         # since PyTorch 2.4 (function operator())
-
+        with contextlib.suppress(AssertionError):
            destroy_process_group()
-        except AssertionError:
-            pass
-
        return current_benchmark_results

    def _get_updated_benchmark_config(
euroeval/callbacks.py
CHANGED
@@ -5,8 +5,8 @@ from collections.abc import Sized
 
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-from transformers import TrainerControl, TrainerState
-from transformers.
+from transformers.trainer_callback import ProgressCallback, TrainerControl, TrainerState
+from transformers.training_args import TrainingArguments
 
 
 class NeverLeaveProgressCallback(ProgressCallback):
euroeval/constants.py
CHANGED
@@ -7,6 +7,13 @@ from .tasks import NER
 DUMMY_FILL_VALUE = 100
 
 
+# This is the maximum allowed context length for models for the purpose of this
+# benchmark. We will still report the models' true maximum context length in the
+# metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
+# all tokens in the context.
+MAX_CONTEXT_LENGTH = 5_000
+
+
 # We need to raise the amount of tokens generated for reasoning models, to give them
 # time to think
 REASONING_MAX_TOKENS = 8_192
@@ -44,10 +51,10 @@ TASK_GROUPS_USING_LOGPROBS = [
 
 # The number of top log probabilities to return for generative models. For several APIs
 # this is the maximum number of log probabilities that can be returned
-MAX_LOGPROBS =
+MAX_LOGPROBS = 8
 
 
-# We make sure to remove these metric
+# We make sure to remove these metric attributes after each iteration, to avoid memory
 # leaks
 METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
 
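The new MAX_CONTEXT_LENGTH constant ties into the vllm.py hunk above, where the engine's context window is capped with min(true_max_model_len, MAX_CONTEXT_LENGTH). A tiny illustration, with an assumed model-advertised context length:

MAX_CONTEXT_LENGTH = 5_000

true_max_model_len = 131_072  # assumed value read from a model config
effective_max_model_len = min(true_max_model_len, MAX_CONTEXT_LENGTH)
print(effective_max_model_len)  # 5000, keeping vLLM's KV-cache allocation bounded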