EuroEval 15.7.0 py3-none-any.whl → 15.7.2 py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

@@ -3,11 +3,9 @@
 import collections.abc as c
 import contextlib
 import importlib.util
-import itertools as it
 import json
 import logging
 import os
-import random
 import re
 import sys
 import typing as t
@@ -56,6 +54,7 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
+from ..generation_utils import apply_prompt, extract_few_shot_examples
 from ..languages import get_all_languages
 from ..task_group_utils import (
     question_answering,
@@ -258,14 +257,22 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
 
         if self.benchmark_config.few_shot:
-            few_shot_examples = self._extract_few_shot_examples(
-                dataset=dataset, task=task, itr_idx=itr_idx
+            few_shot_examples = extract_few_shot_examples(
+                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
             )
         else:
             few_shot_examples = list()
 
         dataset["test"] = dataset["test"].map(
-            partial(self._apply_prompt, few_shot_examples=few_shot_examples, task=task),
+            partial(
+                apply_prompt,
+                few_shot_examples=few_shot_examples,
+                model_config=self.model_config,
+                dataset_config=self.dataset_config,
+                instruction_model=self.buffer["instruction_model"],
+                always_populate_text_field=True,
+                tokenizer=self._tokenizer,
+            ),
             batched=True,
             load_from_cache_file=False,
             keep_in_memory=True,
@@ -439,6 +446,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 logger.info(
                     "Prompts are too long, so truncating them and trying again..."
                 )
+                logger.debug(f"The error message was: {str(e)}")
                 tokenized_prompts = self._tokenizer(
                     text=prompts,
                     truncation=True,
@@ -499,7 +507,6 @@ class VLLMModel(HuggingFaceEncoderModel):
 
         # Sanity check
         if len(completions) != len(prompts):
-            breakpoint()
             raise InvalidBenchmark(
                 f"Expected {len(prompts):,} completions, but got {len(completions):,}."
             )
@@ -615,303 +622,6 @@ class VLLMModel(HuggingFaceEncoderModel):
 
         return model_config
 
-    def _extract_few_shot_examples(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> list[dict[str, t.Any]]:
-        """Extract few-shot examples from a dataset.
-
-        This will always extract the examples from the training split.
-
-        We ensure that the few-shot examples are unique by picking them one at a time.
-
-        Args:
-            dataset:
-                The dataset to extract the few-shot examples from.
-            task:
-                The task that is being benchmarked.
-            itr_idx:
-                The index of the dataset in the iterator.
-
-        Returns:
-            The few-shot examples.
-        """
-        random_seed = 4242 + itr_idx
-        num_few_shots = self.dataset_config.num_few_shot_examples
-        few_shot_examples: list[dict[str, t.Any]] = list()
-        shuffled_train = dataset["train"].shuffle(seed=random_seed)
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                labels = it.cycle(self.dataset_config.labels)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: x["label"].lower() == label.lower()
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TEXT_TO_TEXT:
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-                labels = it.cycle(
-                    [
-                        label.lower()
-                        for label in self.dataset_config.labels
-                        if label.lower().startswith("b-")
-                    ]
-                )
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: label in [tag.lower() for tag in x["labels"]]
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["tokens"] != example["tokens"]
-                    )
-
-            case TaskGroup.QUESTION_ANSWERING:
-                # Locate the maximum number of tokens that constitutes a short example
-                for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
-                    train_with_short_examples = dataset["train"].filter(
-                        lambda example: len(example["context"]) < max_num_tokens
-                    )
-                    num_short_examples = len(train_with_short_examples)
-                    if num_short_examples >= self.dataset_config.num_few_shot_examples:
-                        break
-                else:
-                    raise InvalidBenchmark(
-                        "Could not find enough short examples for few-shot learning."
-                    )
-
-                shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["context"] != example["context"]
-                    )
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        random.seed(random_seed)
-        random.shuffle(few_shot_examples)
-        return few_shot_examples
-
-    def _apply_prompt(
-        self,
-        examples: dict[str, t.Any],
-        few_shot_examples: list[dict[str, t.Any]],
-        task: Task,
-    ) -> dict[str, t.Any]:
-        """Apply prompt template to an example, potentially with few-shot examples.
-
-        Args:
-            examples:
-                The examples to apply the few-shot examples to.
-            few_shot_examples:
-                The few-shot examples to apply.
-            task:
-                The task that is being benchmarked.
-
-        Returns:
-            The example with the few-shot examples applied.
-        """
-
-        def create_prompt(**kwargs: str) -> tuple[str, str]:
-            """Create a prompt from the given keyword arguments.
-
-            Args:
-                kwargs:
-                    The keyword arguments to use in the prompt.
-
-            Returns:
-                A pair (prompt, label), where "label" is an empty string if the model is
-                not instruction tuned (as in this case it is included in the prompt).
-            """
-            label_key = "label" if "label" in kwargs else "target_text"
-            label = kwargs.pop(label_key)
-            assert label is not None, (
-                f"Found a None label for the prompt: {kwargs}. This should not happen."
-            )
-            label_mapping = self.dataset_config.prompt_label_mapping
-            label = label_mapping.get(label, label)
-            if self.buffer["instruction_model"]:
-                prompt = self.dataset_config.instruction_prompt.format(**kwargs)
-                return prompt, label
-            else:
-                kwargs[label_key] = label
-                return self.dataset_config.prompt_template.format(**kwargs), ""
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        label=example["label"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), label="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TEXT_TO_TEXT:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        target_text=example["target_text"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), target_text="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-
-                def create_label(example: dict) -> str:
-                    prompt_labels = self.dataset_config.prompt_label_mapping.values()
-                    labels: dict[str, list[str]] = {
-                        prompt_label: list() for prompt_label in prompt_labels
-                    }
-                    for token, label in zip(example["tokens"], example["labels"]):
-                        label = label.lower()
-                        if label == "o":
-                            continue
-                        prompt_label = self.dataset_config.prompt_label_mapping[label]
-                        if label.startswith("b-"):
-                            labels[prompt_label].append(token)
-                        elif label.startswith("i-"):
-                            labels[prompt_label][-1] += " " + token
-                    return json.dumps(labels, ensure_ascii=False)
-
-                few_shot_sections = [
-                    create_prompt(
-                        text=" ".join(example["tokens"]).replace("\n", " ").strip(),
-                        label=create_label(example=example),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=" ".join(tokens).replace("\n", " ").strip(), label=""
-                    )
-                    for tokens in examples["tokens"]
-                ]
-
-            case TaskGroup.QUESTION_ANSWERING:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["context"].replace("\n", " ").strip(),
-                        question=example["question"].replace("\n", " ").strip(),
-                        label=example["answers"]["text"][0].replace("\n", " "),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=context.replace("\n", " ").strip(),
-                        question=question.replace("\n", " ").strip(),
-                        label="",
-                    )
-                    for context, question in zip(
-                        examples["context"], examples["question"]
-                    )
-                ]
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        if self.buffer["instruction_model"]:
-            few_shot_messages = [
-                dict(role=role, content=content)
-                for prompt, label in few_shot_sections
-                for role, content in [("user", prompt), ("assistant", label)]
-            ]
-
-            messages_list = [
-                few_shot_messages + [dict(role="user", content=prompt)]
-                for prompt, _ in new_sections
-            ]
-
-            # Pick the chat template that matches the language of the dataset, if such a
-            # template exists
-            chat_template: str | None = None
-            if isinstance(self._tokenizer.chat_template, dict):
-                language_codes = [
-                    language.code for language in self.dataset_config.languages
-                ]
-                for name, candidate_template in self._tokenizer.chat_template.items():
-                    if name.lower() in language_codes:
-                        chat_template = candidate_template
-                        log_once(
-                            f"Using the {name!r} chat template for the tokenizer for "
-                            f"model {self.model_config.model_id!r}.",
-                            level=logging.DEBUG,
-                        )
-                        break
-
-            texts = [
-                self._tokenizer.apply_chat_template(
-                    conversation=messages,
-                    tokenize=False,
-                    add_generation_prompt=True,
-                    chat_template=chat_template,
-                )
-                for messages in messages_list
-            ]
-
-            examples["text"] = texts
-
-        else:
-            prompt_prefix = ""
-            if self.dataset_config.prompt_prefix:
-                prompt_prefix = self.dataset_config.prompt_prefix + "\n\n"
-
-            few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
-            if few_shot_prompt:
-                few_shot_prompt += "\n\n"
-
-            examples["text"] = [
-                prompt_prefix + few_shot_prompt + new_prompt
-                for new_prompt, _ in new_sections
-            ]
-
-        return examples
-
     @property
     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.
@@ -1087,7 +797,7 @@ def load_model_and_tokenizer(
             enable_lora=model_config.adapter_base_model_id is not None,
             max_lora_rank=256,
         )
-    except (ValueError, OSError) as e:
+    except (RuntimeError, ValueError, OSError) as e:
         if "awaiting a review from the repo authors" in str(e):
             raise InvalidModel(
                 f"The model {model_id!r} is awaiting a review from the repository "
euroeval/benchmarker.py CHANGED
@@ -372,15 +372,7 @@ class Benchmarker:
 
         current_benchmark_results: list[BenchmarkResult] = list()
         for model_id in model_ids:
-            try:
-                model_config = get_model_config(
-                    model_id=model_id, benchmark_config=benchmark_config
-                )
-            except InvalidModel as e:
-                logger.info(e.message)
-                num_finished_benchmarks += len(dataset_configs)
-                continue
-
+            model_config: ModelConfig | None = None
             loaded_model: BenchmarkModule | None = None
             for dataset_config in dataset_configs:
                 # Skip if we have already benchmarked this model on this dataset and
@@ -394,12 +386,22 @@
                 ):
                     logger.debug(
                         f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it "
-                        "has already been benchmarked."
+                        f"{dataset_config.pretty_name}, as it has already been "
+                        "benchmarked."
                     )
                     num_finished_benchmarks += 1
                     continue
 
+                if model_config is None:
+                    try:
+                        model_config = get_model_config(
+                            model_id=model_id, benchmark_config=benchmark_config
+                        )
+                    except InvalidModel as e:
+                        logger.info(e.message)
+                        num_finished_benchmarks += len(dataset_configs)
+                        continue
+
 
                 # Skip if the model is an encoder model and the task is generative
                 task_is_generative = (
@@ -774,6 +776,7 @@ class Benchmarker:
             metric_configs=dataset_config.task.metrics,
             scores=scores,
             model_id=model_config.model_id,
+            model_revision=model_config.revision,
         )
 
         record = BenchmarkResult(
euroeval/data_models.py CHANGED
@@ -531,7 +531,9 @@ class DatasetConfig:
 
         # Convert labels to single-quoted labels - and remove duplicates
        quoted_labels = [
-            f"'{label}'" for label in set(self.prompt_label_mapping.values())
+            f"'{self.prompt_label_mapping[label]}'"
+            for label in set(self.labels)
+            if label in self.prompt_label_mapping
         ]
 
         if not quoted_labels:
@@ -7,6 +7,7 @@ from .danish import *  # noqa: F403
 from .dutch import *  # noqa: F403
 from .english import *  # noqa: F403
 from .faroese import *  # noqa: F403
+from .finnish import *  # noqa: F403
 from .french import *  # noqa: F403
 from .german import *  # noqa: F403
 from .icelandic import *  # noqa: F403
@@ -6,13 +6,14 @@ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
 ### Official datasets ###
 
-DUTCH_SOCIAL_CONFIG = DatasetConfig(
-    name="dutch-social",
+DBRD_CONFIG = DatasetConfig(
+    name="dbrd",
     pretty_name="the truncated version of the Dutch sentiment classification "
-    "dataset Dutch Social",
-    huggingface_id="EuroEval/dutch-social-mini",
+    "dataset DBRD",
+    huggingface_id="EuroEval/dbrd-mini",
     task=SENT,
     languages=[NL],
+    _labels=["negative", "positive"],
 )
 
 SCALA_NL_CONFIG = DatasetConfig(
@@ -71,18 +72,6 @@ HELLASWAG_NL_CONFIG = DatasetConfig(
 
 ### Unofficial datasets ###
 
-DBRD_CONFIG = DatasetConfig(
-    name="dbrd",
-    pretty_name="the truncated version of the Dutch sentiment classification "
-    "dataset DBRD",
-    huggingface_id="EuroEval/dbrd-mini",
-    task=SENT,
-    languages=[NL],
-    _labels=["negative", "positive"],
-    _prompt_label_mapping=dict(positive="positief", negative="negatief"),
-    unofficial=True,
-)
-
 DUTCH_COLA_CONFIG = DatasetConfig(
     name="dutch-cola",
     pretty_name="the truncated version of the Dutch linguistic acceptability dataset "
@@ -2,7 +2,7 @@
 
 from ..data_models import DatasetConfig
 from ..languages import FI
-from ..tasks import COMMON_SENSE, LA, NER, RC, SENT, SUMM
+from ..tasks import LA, NER, RC, SENT, SUMM
 
 ### Official datasets ###
 
@@ -40,14 +40,16 @@ XLSUM_FI_CONFIG = DatasetConfig(
     languages=[FI],
 )
 
-HELLASWAG_FI_CONFIG = DatasetConfig(
-    name="hellaswag-fi",
-    pretty_name="the truncated version of the Finnish common-sense reasoning dataset "
-    "HellaSwag-fi, translated from the English HellaSwag dataset",
-    huggingface_id="EuroEval/hellaswag-fi-mini",
-    task=COMMON_SENSE,
-    languages=[FI],
-)
+# TODO: Include when this issue has been resolved:
+# https://github.com/EuroEval/EuroEval/issues/158#issuecomment-2846664885
+# HELLASWAG_FI_CONFIG = DatasetConfig(
+#     name="hellaswag-fi",
+#     pretty_name="the truncated version of the Finnish common-sense reasoning dataset "
+#     "HellaSwag-fi, translated from the English HellaSwag dataset",
+#     huggingface_id="EuroEval/hellaswag-fi-mini",
+#     task=COMMON_SENSE,
+#     languages=[FI],
+# )
 
 SCALA_FI_CONFIG = DatasetConfig(
     name="scala-fi",