EuroEval 16.0.0-py3-none-any.whl → 16.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +5 -0
- euroeval/benchmark_config_factory.py +6 -1
- euroeval/benchmark_modules/base.py +2 -0
- euroeval/benchmark_modules/fresh.py +7 -1
- euroeval/benchmark_modules/hf.py +26 -21
- euroeval/benchmark_modules/litellm.py +258 -131
- euroeval/benchmark_modules/vllm.py +120 -68
- euroeval/benchmarker.py +11 -2
- euroeval/cli.py +14 -1
- euroeval/constants.py +7 -1
- euroeval/data_models.py +95 -20
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +14 -3
- euroeval/dataset_configs/dutch.py +14 -0
- euroeval/dataset_configs/english.py +22 -0
- euroeval/dataset_configs/estonian.py +15 -7
- euroeval/dataset_configs/finnish.py +14 -0
- euroeval/dataset_configs/french.py +14 -0
- euroeval/dataset_configs/german.py +23 -0
- euroeval/dataset_configs/italian.py +14 -0
- euroeval/dataset_configs/latvian.py +14 -0
- euroeval/dataset_configs/norwegian.py +14 -0
- euroeval/dataset_configs/polish.py +126 -0
- euroeval/dataset_configs/portuguese.py +14 -0
- euroeval/dataset_configs/spanish.py +14 -0
- euroeval/dataset_configs/swedish.py +25 -0
- euroeval/enums.py +12 -0
- euroeval/generation.py +17 -8
- euroeval/generation_utils.py +102 -16
- euroeval/metrics/pipeline.py +51 -9
- euroeval/model_cache.py +13 -1
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +27 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +11 -0
- euroeval/prompt_templates/sentiment_classification.py +15 -0
- euroeval/prompt_templates/summarization.py +27 -1
- euroeval/scores.py +5 -0
- euroeval/task_group_utils/multiple_choice_classification.py +2 -2
- euroeval/task_group_utils/question_answering.py +29 -29
- euroeval/task_group_utils/sequence_classification.py +71 -81
- euroeval/task_group_utils/token_classification.py +17 -3
- euroeval/tasks.py +12 -10
- euroeval/{tokenization_utils.py → tokenisation_utils.py} +41 -25
- euroeval/utils.py +67 -3
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/METADATA +3 -1
- euroeval-16.1.0.dist-info/RECORD +70 -0
- euroeval-16.0.0.dist-info/RECORD +0 -69
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/WHEEL +0 -0
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/licenses/LICENSE +0 -0

euroeval/benchmark_modules/vllm.py CHANGED

@@ -44,7 +44,11 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
-from ..generation_utils import
+from ..generation_utils import (
+    apply_prompt,
+    extract_few_shot_examples,
+    raise_if_wrong_params,
+)
 from ..languages import get_all_languages
 from ..task_group_utils import (
     question_answering,
@@ -52,7 +56,7 @@ from ..task_group_utils import (
     text_to_text,
     token_classification,
 )
-from ..tokenization_utils import (
+from ..tokenisation_utils import (
     apply_chat_template,
     get_bos_token,
     get_end_of_chat_token_ids,
@@ -69,6 +73,7 @@ from ..utils import (
     get_hf_token,
     get_min_cuda_compute_capability,
     log_once,
+    split_model_id,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
 
@@ -97,6 +102,7 @@ class VLLMModel(HuggingFaceEncoderModel):
     fresh_model = False
     batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = True
+    allowed_params = {re.compile(r".*"): ["thinking", "no-thinking"]}
 
     def __init__(
         self,
@@ -120,42 +126,46 @@ class VLLMModel(HuggingFaceEncoderModel):
         if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")
 
+        raise_if_wrong_params(
+            model_config=model_config, allowed_params=self.allowed_params
+        )
+
         model, tokeniser = load_model_and_tokeniser(
             model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: "LLM" = model
         self._tokeniser: "PreTrainedTokenizer" = tokeniser
+
+        # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
+        # to call the `__init__` method of the `BenchmarkModule` class.
+        super(HuggingFaceEncoderModel, self).__init__(
+            model_config=model_config,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
+        )
+
         self.end_of_reasoning_token = get_end_of_reasoning_token(
             model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
         )
         self.end_of_chat_token_ids = get_end_of_chat_token_ids(
-            tokeniser=self._tokeniser
+            tokeniser=self._tokeniser, generative_type=self.generative_type
         )
         self.custom_stop_tokens = get_custom_stop_tokens(
             model=self._model,
             tokeniser=self._tokeniser,
             model_id=model_config.model_id,
-
-        )
-
-        # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
-        # to call the `__init__` method of the `BenchmarkModule` class.
-        super(HuggingFaceEncoderModel, self).__init__(
-            model_config=model_config,
-            dataset_config=dataset_config,
-            benchmark_config=benchmark_config,
-            log_metadata=log_metadata,
+            generative_type=self.generative_type,
         )
 
         self.buffer |= dict(
-            instruction_model=has_chat_template(tokeniser=self._tokeniser),
             first_label_token_mapping=get_first_label_token_mapping(
                 dataset_config=self.dataset_config,
                 model_config=self.model_config,
                 tokeniser=self._tokeniser,
                 generative_type=self.generative_type,
                 log_metadata=self.log_metadata,
-            )
+            )
         )
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
@@ -187,16 +197,36 @@ class VLLMModel(HuggingFaceEncoderModel):
             The generative type of the model, or None if it has not been set yet.
         """
         if not hasattr(self, "_tokeniser"):
+            log_once(
+                "The generative type of the model has not been set yet as the "
+                "tokeniser has not been loaded.",
+                level=logging.DEBUG,
+            )
             return None
-        elif self.
-
+        elif self.benchmark_config.generative_type is not None:
+            type_ = self.benchmark_config.generative_type
+        elif self.model_config.param in {"thinking"}:
+            type_ = GenerativeType.REASONING
+        elif self.model_config.param in {"no-thinking"}:
+            type_ = GenerativeType.INSTRUCTION_TUNED
+        elif (
+            hasattr(self, "end_of_reasoning_token")
+            and self.end_of_reasoning_token is not None
+        ):
+            type_ = GenerativeType.REASONING
         elif (
             has_chat_template(tokeniser=self._tokeniser)
             or "instruct" in self.model_config.model_id.lower()
         ):
-
+            type_ = GenerativeType.INSTRUCTION_TUNED
         else:
-
+            type_ = GenerativeType.BASE
+        log_once(
+            f"Detected generative type {type_.name!r} for model "
+            f"{self.model_config.model_id!r}",
+            level=logging.DEBUG,
+        )
+        return type_
 
     @property
     def extract_labels_from_generation(self) -> ExtractLabelsFunction:
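The property above resolves the generative type through an explicit precedence chain. The following standalone function is a condensed restatement of that order for reference only; the parameter names are illustrative and this is not the code that ships in the wheel:

from euroeval.enums import GenerativeType

def resolve_generative_type(
    benchmark_override: GenerativeType | None,
    model_param: str | None,
    end_of_reasoning_token: str | None,
    has_chat_template: bool,
    model_id: str,
) -> GenerativeType:
    """Sketch of the precedence used by the new `generative_type` property."""
    if benchmark_override is not None:
        # 1. An explicit --generative-type / Benchmarker argument wins.
        return benchmark_override
    if model_param == "thinking":
        # 2. A `#thinking` suffix on the model ID forces reasoning mode.
        return GenerativeType.REASONING
    if model_param == "no-thinking":
        return GenerativeType.INSTRUCTION_TUNED
    if end_of_reasoning_token is not None:
        # 3. A detected end-of-reasoning token implies a reasoning model.
        return GenerativeType.REASONING
    if has_chat_template or "instruct" in model_id.lower():
        # 4. A chat template or "instruct" in the ID implies instruction tuning.
        return GenerativeType.INSTRUCTION_TUNED
    return GenerativeType.BASE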
@@ -285,7 +315,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 few_shot_examples=few_shot_examples,
                 model_config=self.model_config,
                 dataset_config=self.dataset_config,
-
+                generative_type=self.generative_type,
                 always_populate_text_field=True,
                 tokeniser=self._tokeniser,
             ),
@@ -313,7 +343,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         """
         # Get stopping tokens
         stop_tokens: list[str] = self.custom_stop_tokens.copy()
-        if self.
+        if self.generative_type == GenerativeType.BASE:
             stop_tokens.append("\n\n")
         if self._tokeniser.pad_token_id is not None:
             assert isinstance(self._tokeniser.pad_token, str), (
@@ -337,31 +367,6 @@ class VLLMModel(HuggingFaceEncoderModel):
         if end_of_chat_token:
             stop_tokens.append(end_of_chat_token)
 
-        structured_generation_schema = None
-        if self.dataset_config.task.uses_structured_output:
-            if self.generative_type == GenerativeType.REASONING:
-                log_once(
-                    f"The model {self.model_config.model_id!r} is a reasoning model "
-                    "and thus does not support structured generation, so we do not "
-                    "enable it.",
-                    level=logging.DEBUG,
-                )
-            else:
-                ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-                keys_and_their_types: dict[str, t.Any] = {
-                    tag_name: (conlist(str, max_length=5), ...)
-                    for tag_name in ner_tag_names
-                }
-                answer_format_class = create_model(
-                    "AnswerFormat", **keys_and_their_types
-                )
-                structured_generation_schema = answer_format_class.model_json_schema()
-                log_once(
-                    "Using structured generation with the JSON schema "
-                    f"{structured_generation_schema}",
-                    level=logging.DEBUG,
-                )
-
         # Get the mapping from labels to the first token in the label. We call this each
         # time we generate a new dataset since the dataset config can change
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
@@ -382,8 +387,29 @@ class VLLMModel(HuggingFaceEncoderModel):
                 "error was. Skipping this evaluation."
             )
 
-
-        if
+        structured_generation_schema = None
+        if (
+            self.dataset_config.task.uses_structured_output
+            or (self.dataset_config.task.uses_logprobs and self.dataset_config.labels)
+        ) and self.generative_type == GenerativeType.REASONING:
+            guided_decoding = None
+            logger.debug(
+                "The dataset uses structured output, but we are not using it as the "
+                "model is a reasoning model."
+            )
+        elif self.dataset_config.task.uses_structured_output:
+            ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+            keys_and_their_types: dict[str, t.Any] = {
+                tag_name: (conlist(str, max_length=5), ...)
+                for tag_name in ner_tag_names
+            }
+            answer_format_class = create_model("AnswerFormat", **keys_and_their_types)
+            structured_generation_schema = answer_format_class.model_json_schema()
+            log_once(
+                "Using structured generation with the JSON schema: "
+                f"{json.dumps(structured_generation_schema)}",
+                level=logging.DEBUG,
+            )
             guided_decoding = GuidedDecodingParams(json=structured_generation_schema)
         elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
             guided_decoding = GuidedDecodingParams(
@@ -392,8 +418,17 @@ class VLLMModel(HuggingFaceEncoderModel):
                     for label in self.dataset_config.labels
                 ]
             )
+            log_once(
+                "Using structured generation with the choices: "
+                f"{guided_decoding.choice!r}.",
+                level=logging.DEBUG,
+            )
         else:
             guided_decoding = None
+            log_once(
+                "Not using structured generation as the dataset does not require it.",
+                level=logging.DEBUG,
+            )
 
         # Define the parameters used for vLLM generation
         max_tokens: int = (
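For context, the `AnswerFormat` class built in the NER branch above is an ordinary dynamically created Pydantic model, and its JSON schema is what gets handed to vLLM via `GuidedDecodingParams(json=...)`, while classification datasets instead constrain decoding to the label set via `GuidedDecodingParams(choice=[...])`. A minimal, self-contained sketch of the schema construction, with made-up tag names standing in for `dataset_config.prompt_label_mapping.values()`:

import json
from pydantic import conlist, create_model

# Hypothetical NER tag names; in EuroEval these come from the dataset's
# prompt_label_mapping values.
ner_tag_names = ["person", "location", "organisation"]

# Each tag maps to a list of at most five strings, mirroring the diff above.
keys_and_their_types = {
    tag_name: (conlist(str, max_length=5), ...) for tag_name in ner_tag_names
}
AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)

# This JSON schema is what guided decoding is pointed at.
print(json.dumps(AnswerFormat.model_json_schema(), indent=2))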
@@ -425,9 +460,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         labels_to_be_generated = list(self.dataset_config.prompt_label_mapping.values())
         if len(labels_to_be_generated) == 0:
             labels_to_be_generated = ["negative", "positive"]
-        if
-            "instruction_model", False
-        ) and should_prompts_be_stripped(
+        if self.generative_type == GenerativeType.BASE and should_prompts_be_stripped(
             labels_to_be_generated=labels_to_be_generated, tokeniser=self._tokeniser
         ):
             log_once(
@@ -439,6 +472,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         # Generate sequences using vLLM
         input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
         num_attempts = 3
+        truncation_attempts = 0
         for _ in range(num_attempts):
             try:
                 raw_outputs = self._model.generate(
@@ -466,12 +500,19 @@ class VLLMModel(HuggingFaceEncoderModel):
                     "Prompts are too long, so truncating them and trying again..."
                 )
                 logger.debug(f"The error message was: {str(e)}")
+
+                # If we have already tried truncating the prompts a few times, then
+                # we truncate a bit more aggressively
+                extra_truncation = 50 * truncation_attempts
+                truncation_attempts += 1
+
                 tokenized_prompts = self._tokeniser(
                     text=prompts,
                     truncation=True,
                     max_length=max(
                         min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH)
-                        - max_tokens,
+                        - max_tokens
+                        - extra_truncation,
                         0,
                     ),
                 )
@@ -577,9 +618,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         if using_api:
             return False
 
-
-
-
+        model_id_components = split_model_id(model_id=model_id)
+        model_id = model_id_components.model_id
+        revision = model_id_components.revision
+
 
         model_info = get_model_repo_info(
             model_id=model_id, revision=revision, benchmark_config=benchmark_config
@@ -603,11 +645,11 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The model configuration.
         """
-        model_id, revision = (
-            model_id.split("@") if "@" in model_id else (model_id, "main")
-        )
+        model_id_components = split_model_id(model_id=model_id)
         model_info = get_model_repo_info(
-            model_id=model_id,
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            benchmark_config=benchmark_config,
         )
         if model_info is None:
             raise InvalidModel(f"The model {model_id!r} could not be found.")
@@ -616,8 +658,9 @@ class VLLMModel(HuggingFaceEncoderModel):
         language_codes = list(language_mapping.keys())
 
         model_config = ModelConfig(
-            model_id=model_id,
-            revision=revision,
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            param=model_id_components.param,
             task=model_info.pipeline_tag,
             languages=[
                 language_mapping[tag]
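`split_model_id` itself lives in `euroeval/utils.py`, which is not part of this excerpt. Judging from how its result is consumed here and in `benchmarker.py` below (model IDs of the form `model@revision#param`), it plausibly behaves like the sketch below; the class name, field names and defaults are assumptions rather than the actual implementation:

from dataclasses import dataclass

@dataclass
class ModelIdComponents:
    # Hypothetical container; the real helper may use a different structure.
    model_id: str
    revision: str
    param: str | None

def split_model_id(model_id: str) -> ModelIdComponents:
    """Split 'org/model@revision#param' into its components (sketch)."""
    param: str | None = None
    if "#" in model_id:
        model_id, param = model_id.rsplit("#", 1)
    revision = "main"
    if "@" in model_id:
        model_id, revision = model_id.rsplit("@", 1)
    return ModelIdComponents(model_id=model_id, revision=revision, param=param)

# e.g. split_model_id("Qwen/Qwen3-8B@main#no-thinking") would yield
# ModelIdComponents(model_id="Qwen/Qwen3-8B", revision="main", param="no-thinking")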
@@ -972,7 +1015,11 @@ def get_end_of_reasoning_token(
     prompt = "What is your name?"
     if has_chat_template(tokeniser=tokeniser):
         templated_prompt = apply_chat_template(
-            conversation=[dict(role="user", content=prompt)],
+            conversation=[dict(role="user", content=prompt)],
+            tokeniser=tokeniser,
+            tokenise=False,
+            add_generation_prompt=True,
+            enable_thinking=True,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
@@ -1050,7 +1097,7 @@ def get_custom_stop_tokens(
     model: "LLM",
     tokeniser: "PreTrainedTokenizer",
     model_id: str,
-
+    generative_type: GenerativeType | None,
 ) -> list[str]:
     """Get the stop tokens for a generative model.
 
@@ -1061,9 +1108,8 @@ def get_custom_stop_tokens(
             The tokeniser.
         model_id:
            The model ID.
-
-
-            of generated tokens to allow before stopping the generation.
+        generative_type:
+            The generative type of the model.
 
     Returns:
         A list of stop tokens.
@@ -1073,12 +1119,18 @@ def get_custom_stop_tokens(
     prompt = "Hello"
     if has_chat_template(tokeniser=tokeniser):
         templated_prompt = apply_chat_template(
-            conversation=[dict(role="user", content=prompt)],
+            conversation=[dict(role="user", content=prompt)],
+            tokeniser=tokeniser,
+            tokenise=False,
+            add_generation_prompt=True,
+            enable_thinking=generative_type == GenerativeType.REASONING,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
 
-    max_tokens =
+    max_tokens = (
+        REASONING_MAX_TOKENS if generative_type == GenerativeType.REASONING else 10
+    )
     completion = (
         model.generate(
             prompts=[prompt],
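The `enable_thinking` flag threaded through `apply_chat_template` above corresponds to the switch that some chat templates (notably Qwen3-style ones) expose for toggling reasoning traces. With plain `transformers` the equivalent call looks roughly like the sketch below; the model name is only an example, and note that EuroEval's own wrapper spells the argument `tokenise` while Hugging Face uses `tokenize`:

from transformers import AutoTokenizer

# Example model; any model whose chat template honours `enable_thinking` works.
tokeniser = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")

prompt = tokeniser.apply_chat_template(
    [dict(role="user", content="Hello")],
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # extra kwargs are forwarded to the Jinja chat template
)
print(prompt)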

euroeval/benchmarker.py CHANGED

@@ -19,7 +19,7 @@ from .constants import GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
-from .enums import Device, ModelType
+from .enums import Device, GenerativeType, ModelType
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
 from .generation import generate
@@ -79,6 +79,7 @@ class Benchmarker:
         api_base: str | None = None,
         api_version: str | None = None,
         gpu_memory_utilization: float = 0.9,
+        generative_type: GenerativeType | None = None,
         debug: bool = False,
         run_with_cli: bool = False,
         requires_safetensors: bool = False,
@@ -151,6 +152,10 @@ class Benchmarker:
                 is generative. A larger value will result in faster evaluation, but at
                 the risk of running out of GPU memory. Only reduce this if you are
                 running out of GPU memory. Defaults to 0.9.
+            generative_type:
+                The type of generative model to benchmark. Only relevant if the model is
+                generative. If not specified, then the type will be inferred based on
+                the tags of the model. Defaults to None.
             debug:
                 Whether to output debug information. Defaults to False.
             run_with_cli:
@@ -199,6 +204,7 @@ class Benchmarker:
             api_base=api_base,
             api_version=api_version,
             gpu_memory_utilization=gpu_memory_utilization,
+            generative_type=generative_type,
             debug=debug,
             run_with_cli=run_with_cli,
             requires_safetensors=requires_safetensors,
@@ -438,7 +444,7 @@ class Benchmarker:
 
         # Skip if the model type should not be benchmarked on this dataset
         model_type = model_config.model_type
-        allowed_model_types = dataset_config.
+        allowed_model_types = dataset_config.allowed_model_types
         if model_type not in allowed_model_types:
             logger.debug(
                 f"Skipping benchmarking {model_id} on "
@@ -804,6 +810,7 @@ class Benchmarker:
             scores=scores,
             model_id=model_config.model_id,
             model_revision=model_config.revision,
+            model_param=model_config.param,
         )
 
         record = BenchmarkResult(
@@ -1108,6 +1115,8 @@ def initial_logging(
     model_id = model_config.model_id
     if model_config.revision and model_config.revision != "main":
         model_id += f"@{model_config.revision}"
+    if model_config.param is not None:
+        model_id += f"#{model_config.param}"
 
     split_type = "validation" if not benchmark_config.evaluate_test_split else "test"
     if model_config.task in GENERATIVE_PIPELINE_TAGS:

euroeval/cli.py CHANGED

@@ -4,7 +4,7 @@ import click
 
 from .benchmarker import Benchmarker
 from .dataset_configs import get_all_dataset_configs
-from .enums import Device
+from .enums import Device, GenerativeType
 from .languages import get_all_languages
 from .tasks import get_all_tasks
 
@@ -208,6 +208,14 @@ from .tasks import get_all_tasks
     help="Only allow loading models that have safetensors weights available",
     default=False,
 )
+@click.option(
+    "--generative-type",
+    type=click.Choice(["base", "instruction_tuned", "reasoning"]),
+    default=None,
+    show_default=True,
+    help="The type of generative model. Only relevant if the model is generative. If "
+    "not specified, the type will be inferred automatically.",
+)
 def benchmark(
     model: tuple[str],
     dataset: tuple[str],
@@ -234,6 +242,7 @@ def benchmark(
     gpu_memory_utilization: float,
     debug: bool,
     requires_safetensors: bool,
+    generative_type: str | None,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
     models = list(model)
@@ -244,6 +253,9 @@ def benchmark(
     tasks = None if len(task) == 0 else list(task)
     batch_size_int = int(batch_size)
     device = Device[device.upper()] if device is not None else None
+    generative_type_obj = (
+        GenerativeType[generative_type.upper()] if generative_type else None
+    )
 
     benchmarker = Benchmarker(
         language=languages,
@@ -268,6 +280,7 @@ def benchmark(
         api_base=api_base,
         api_version=api_version,
         gpu_memory_utilization=gpu_memory_utilization,
+        generative_type=generative_type_obj,
         debug=debug,
         run_with_cli=True,
         requires_safetensors=requires_safetensors,

euroeval/constants.py CHANGED

@@ -15,7 +15,7 @@ MAX_CONTEXT_LENGTH = 8_192
 
 # We need to raise the amount of tokens generated for reasoning models, to give them
 # time to think
-REASONING_MAX_TOKENS =
+REASONING_MAX_TOKENS = 8_192
 
 
 # The Hugging Face Hub pipeline tags used to classify models as generative
@@ -75,3 +75,9 @@ LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
 
 # These characters are stripped from JSON output when trying to identify the label
 JSON_STRIP_CHARACTERS = ' {}\n\r":'
+
+
+# The number of tokens we generate when evaluating generative models on classification
+# tasks. We also use this to determine whether we should store logprobs in the model
+# outputs (and cache).
+NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
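Taken together, the changes to `cli.py` and `benchmarker.py` above expose the new override in two ways: on the command line (e.g. something like `euroeval --model <model-id> --generative-type reasoning`, using the option added in this release) and through the Python API. A hypothetical Python invocation, assuming the usual `Benchmarker.benchmark` entry point and leaving the model ID as a placeholder:

from euroeval import Benchmarker
from euroeval.enums import GenerativeType

# Treat the model as a reasoning model instead of relying on auto-detection.
# All other Benchmarker arguments keep their defaults in this sketch.
benchmarker = Benchmarker(generative_type=GenerativeType.REASONING)
benchmarker.benchmark(model="<model-id>")  # model ID left as a placeholder

A per-model alternative introduced in the same release is the `#thinking` / `#no-thinking` suffix on the model ID, which `split_model_id` parses and the `generative_type` property honours, as shown in the `vllm.py` hunks above.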