EuroEval 15.7.0__py3-none-any.whl → 15.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/benchmark_modules/litellm.py +12 -253
- euroeval/benchmark_modules/vllm.py +13 -303
- euroeval/benchmarker.py +1 -0
- euroeval/data_models.py +3 -1
- euroeval/dataset_configs/dutch.py +5 -16
- euroeval/generation_utils.py +346 -0
- euroeval/scores.py +7 -1
- {euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/METADATA +1 -1
- {euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/RECORD +12 -11
- {euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/entry_points.txt +0 -0
- {euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/litellm.py
CHANGED

@@ -1,11 +1,8 @@
 """Generative models from an inference API, using the LiteLLM framework."""

 import collections.abc as c
-import itertools as it
-import json
 import logging
 import os
-import random
 import re
 import typing as t
 from functools import cached_property, partial
@@ -60,6 +57,7 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
+from ..generation_utils import apply_prompt, extract_few_shot_examples
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -943,14 +941,22 @@ class LiteLLMModel(BenchmarkModule):
         )

         if self.benchmark_config.few_shot:
-            few_shot_examples =
-                dataset=dataset,
+            few_shot_examples = extract_few_shot_examples(
+                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
             )
         else:
             few_shot_examples = list()

         dataset["test"] = dataset["test"].map(
-            partial(
+            partial(
+                apply_prompt,
+                few_shot_examples=few_shot_examples,
+                model_config=self.model_config,
+                dataset_config=self.dataset_config,
+                instruction_model=True,
+                always_populate_text_field=False,
+                tokenizer=None,
+            ),
             batched=True,
             load_from_cache_file=False,
             keep_in_memory=True,
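For orientation, the refactored call boils down to mapping the shared `apply_prompt` helper over the test split with `functools.partial`. A minimal sketch of that pattern is shown below; the wrapper function name `build_prompted_dataset` and the argument values are illustrative, not taken from the benchmark code.

from functools import partial

from datasets import Dataset

from euroeval.generation_utils import apply_prompt


def build_prompted_dataset(dataset: Dataset, model_config, dataset_config) -> Dataset:
    """Map the shared prompt helper over a test split (illustrative sketch only)."""
    return dataset.map(
        partial(
            apply_prompt,
            few_shot_examples=[],              # no few-shot examples in this sketch
            model_config=model_config,
            dataset_config=dataset_config,
            instruction_model=True,            # API models are treated as instruction-tuned
            always_populate_text_field=False,  # keep raw chat messages
            tokenizer=None,                    # no tokenizer needed when sending messages
        ),
        batched=True,
        load_from_cache_file=False,
    )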
@@ -958,253 +964,6 @@ class LiteLLMModel(BenchmarkModule):

         return dataset

-    def _extract_few_shot_examples(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> list[dict[str, t.Any]]:
-        """Extract few-shot examples from a dataset.
-
-        This will always extract the examples from the training split.
-
-        We ensure that the few-shot examples are unique by picking them one at a time.
-
-        Args:
-            dataset:
-                The dataset to extract the few-shot examples from.
-            task:
-                The task that is being benchmarked.
-            itr_idx:
-                The index of the dataset in the iterator.
-
-        Returns:
-            The few-shot examples.
-        """
-        random_seed = 4242 + itr_idx
-        num_few_shots = self.dataset_config.num_few_shot_examples
-        few_shot_examples: list[dict[str, t.Any]] = list()
-        shuffled_train = dataset["train"].shuffle(seed=random_seed)
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                labels = it.cycle(self.dataset_config.labels)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: x["label"].lower() == label.lower()
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TEXT_TO_TEXT:
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-                labels = it.cycle(
-                    [
-                        label.lower()
-                        for label in self.dataset_config.labels
-                        if label.lower().startswith("b-")
-                    ]
-                )
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: label in [tag.lower() for tag in x["labels"]]
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["tokens"] != example["tokens"]
-                    )
-
-            case TaskGroup.QUESTION_ANSWERING:
-                # Locate the maximum number of tokens that constitutes a short example
-                for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
-                    train_with_short_examples = dataset["train"].filter(
-                        lambda example: len(example["context"]) < max_num_tokens
-                    )
-                    num_short_examples = len(train_with_short_examples)
-                    if num_short_examples >= self.dataset_config.num_few_shot_examples:
-                        break
-                else:
-                    raise InvalidBenchmark(
-                        "Could not find enough short examples for few-shot learning."
-                    )
-
-                shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["context"] != example["context"]
-                    )
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        random.seed(random_seed)
-        random.shuffle(few_shot_examples)
-        return few_shot_examples
-
-    def _apply_prompt(
-        self,
-        examples: dict[str, t.Any],
-        few_shot_examples: list[dict[str, t.Any]],
-        task: Task,
-    ) -> dict[str, t.Any]:
-        """Apply prompt template to an example, potentially with few-shot examples.
-
-        Args:
-            examples:
-                The examples to apply the few-shot examples to.
-            few_shot_examples:
-                The few-shot examples to apply.
-            task:
-                The task that is being benchmarked.
-
-        Returns:
-            The example with the few-shot examples applied.
-        """
-
-        def create_prompt(**kwargs: str) -> tuple[str, str]:
-            """Create a prompt from the given keyword arguments.
-
-            Args:
-                kwargs:
-                    The keyword arguments to use in the prompt.
-
-            Returns:
-                A pair (prompt, label), where "label" is an empty string if the model is
-                not instruction tuned (as in this case it is included in the prompt).
-            """
-            label_key = "label" if "label" in kwargs else "target_text"
-            label = kwargs.pop(label_key)
-            label_mapping = self.dataset_config.prompt_label_mapping
-            label = label_mapping.get(label, label)
-            prompt = self.dataset_config.instruction_prompt.format(**kwargs)
-            return prompt, label
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        label=example["label"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), label="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TEXT_TO_TEXT:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        target_text=example["target_text"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), target_text="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-
-                def create_label(example: dict) -> str:
-                    prompt_labels = self.dataset_config.prompt_label_mapping.values()
-                    labels: dict[str, list[str]] = {
-                        prompt_label: list() for prompt_label in prompt_labels
-                    }
-                    for token, label in zip(example["tokens"], example["labels"]):
-                        label = label.lower()
-                        if label == "o":
-                            continue
-                        prompt_label = self.dataset_config.prompt_label_mapping[label]
-                        if label.startswith("b-"):
-                            labels[prompt_label].append(token)
-                        elif label.startswith("i-"):
-                            labels[prompt_label][-1] += " " + token
-                    return json.dumps(labels, ensure_ascii=False)
-
-                few_shot_sections = [
-                    create_prompt(
-                        text=" ".join(example["tokens"]).replace("\n", " ").strip(),
-                        label=create_label(example=example),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=" ".join(tokens).replace("\n", " ").strip(), label=""
-                    )
-                    for tokens in examples["tokens"]
-                ]
-
-            case TaskGroup.QUESTION_ANSWERING:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["context"].replace("\n", " ").strip(),
-                        question=example["question"].replace("\n", " ").strip(),
-                        label=example["answers"]["text"][0].replace("\n", " "),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=context.replace("\n", " ").strip(),
-                        question=question.replace("\n", " ").strip(),
-                        label="",
-                    )
-                    for context, question in zip(
-                        examples["context"], examples["question"]
-                    )
-                ]
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        few_shot_messages = [
-            dict(role=role, content=content)
-            for prompt, label in few_shot_sections
-            for role, content in [("user", prompt), ("assistant", label)]
-        ]
-
-        messages_list = [
-            few_shot_messages + [dict(role="user", content=prompt)]
-            for prompt, _ in new_sections
-        ]
-
-        examples["messages"] = messages_list
-        return examples
-

 def raise_if_wrong_params(
     model_config: ModelConfig, allowed_params: dict[str, list[str]]
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -3,11 +3,9 @@
 import collections.abc as c
 import contextlib
 import importlib.util
-import itertools as it
 import json
 import logging
 import os
-import random
 import re
 import sys
 import typing as t
@@ -56,6 +54,7 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
+from ..generation_utils import apply_prompt, extract_few_shot_examples
 from ..languages import get_all_languages
 from ..task_group_utils import (
     question_answering,
@@ -258,14 +257,22 @@ class VLLMModel(HuggingFaceEncoderModel):
         )

         if self.benchmark_config.few_shot:
-            few_shot_examples =
-                dataset=dataset,
+            few_shot_examples = extract_few_shot_examples(
+                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
             )
         else:
             few_shot_examples = list()

         dataset["test"] = dataset["test"].map(
-            partial(
+            partial(
+                apply_prompt,
+                few_shot_examples=few_shot_examples,
+                model_config=self.model_config,
+                dataset_config=self.dataset_config,
+                instruction_model=self.buffer["instruction_model"],
+                always_populate_text_field=True,
+                tokenizer=self._tokenizer,
+            ),
             batched=True,
             load_from_cache_file=False,
             keep_in_memory=True,
@@ -439,6 +446,7 @@ class VLLMModel(HuggingFaceEncoderModel):
             logger.info(
                 "Prompts are too long, so truncating them and trying again..."
             )
+            logger.debug(f"The error message was: {str(e)}")
             tokenized_prompts = self._tokenizer(
                 text=prompts,
                 truncation=True,
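The retry path shown here re-tokenizes the prompts with truncation enabled before generating again. A rough, self-contained sketch of what such a tokenizer call does; the model id and maximum length are assumptions for the sketch, not the module's actual configuration:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # hypothetical model choice
prompts = ["a very long prompt " * 500, "a short prompt"]

# Truncate each prompt to an assumed maximum context length; the truncated
# input_ids can then be fed back into generation.
tokenized_prompts = tokenizer(
    text=prompts,
    truncation=True,
    max_length=1024,  # assumed limit for the sketch
)
print(len(tokenized_prompts["input_ids"][0]))  # at most 1024 tokens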
@@ -499,7 +507,6 @@ class VLLMModel(HuggingFaceEncoderModel):

         # Sanity check
         if len(completions) != len(prompts):
-            breakpoint()
             raise InvalidBenchmark(
                 f"Expected {len(prompts):,} completions, but got {len(completions):,}."
             )
@@ -615,303 +622,6 @@ class VLLMModel(HuggingFaceEncoderModel):

         return model_config

-    def _extract_few_shot_examples(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> list[dict[str, t.Any]]:
-        """Extract few-shot examples from a dataset.
-
-        This will always extract the examples from the training split.
-
-        We ensure that the few-shot examples are unique by picking them one at a time.
-
-        Args:
-            dataset:
-                The dataset to extract the few-shot examples from.
-            task:
-                The task that is being benchmarked.
-            itr_idx:
-                The index of the dataset in the iterator.
-
-        Returns:
-            The few-shot examples.
-        """
-        random_seed = 4242 + itr_idx
-        num_few_shots = self.dataset_config.num_few_shot_examples
-        few_shot_examples: list[dict[str, t.Any]] = list()
-        shuffled_train = dataset["train"].shuffle(seed=random_seed)
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                labels = it.cycle(self.dataset_config.labels)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: x["label"].lower() == label.lower()
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TEXT_TO_TEXT:
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-                labels = it.cycle(
-                    [
-                        label.lower()
-                        for label in self.dataset_config.labels
-                        if label.lower().startswith("b-")
-                    ]
-                )
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: label in [tag.lower() for tag in x["labels"]]
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["tokens"] != example["tokens"]
-                    )
-
-            case TaskGroup.QUESTION_ANSWERING:
-                # Locate the maximum number of tokens that constitutes a short example
-                for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
-                    train_with_short_examples = dataset["train"].filter(
-                        lambda example: len(example["context"]) < max_num_tokens
-                    )
-                    num_short_examples = len(train_with_short_examples)
-                    if num_short_examples >= self.dataset_config.num_few_shot_examples:
-                        break
-                else:
-                    raise InvalidBenchmark(
-                        "Could not find enough short examples for few-shot learning."
-                    )
-
-                shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["context"] != example["context"]
-                    )
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        random.seed(random_seed)
-        random.shuffle(few_shot_examples)
-        return few_shot_examples
-
-    def _apply_prompt(
-        self,
-        examples: dict[str, t.Any],
-        few_shot_examples: list[dict[str, t.Any]],
-        task: Task,
-    ) -> dict[str, t.Any]:
-        """Apply prompt template to an example, potentially with few-shot examples.
-
-        Args:
-            examples:
-                The examples to apply the few-shot examples to.
-            few_shot_examples:
-                The few-shot examples to apply.
-            task:
-                The task that is being benchmarked.
-
-        Returns:
-            The example with the few-shot examples applied.
-        """
-
-        def create_prompt(**kwargs: str) -> tuple[str, str]:
-            """Create a prompt from the given keyword arguments.
-
-            Args:
-                kwargs:
-                    The keyword arguments to use in the prompt.
-
-            Returns:
-                A pair (prompt, label), where "label" is an empty string if the model is
-                not instruction tuned (as in this case it is included in the prompt).
-            """
-            label_key = "label" if "label" in kwargs else "target_text"
-            label = kwargs.pop(label_key)
-            assert label is not None, (
-                f"Found a None label for the prompt: {kwargs}. This should not happen."
-            )
-            label_mapping = self.dataset_config.prompt_label_mapping
-            label = label_mapping.get(label, label)
-            if self.buffer["instruction_model"]:
-                prompt = self.dataset_config.instruction_prompt.format(**kwargs)
-                return prompt, label
-            else:
-                kwargs[label_key] = label
-                return self.dataset_config.prompt_template.format(**kwargs), ""
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        label=example["label"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), label="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TEXT_TO_TEXT:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        target_text=example["target_text"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), target_text="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-
-                def create_label(example: dict) -> str:
-                    prompt_labels = self.dataset_config.prompt_label_mapping.values()
-                    labels: dict[str, list[str]] = {
-                        prompt_label: list() for prompt_label in prompt_labels
-                    }
-                    for token, label in zip(example["tokens"], example["labels"]):
-                        label = label.lower()
-                        if label == "o":
-                            continue
-                        prompt_label = self.dataset_config.prompt_label_mapping[label]
-                        if label.startswith("b-"):
-                            labels[prompt_label].append(token)
-                        elif label.startswith("i-"):
-                            labels[prompt_label][-1] += " " + token
-                    return json.dumps(labels, ensure_ascii=False)
-
-                few_shot_sections = [
-                    create_prompt(
-                        text=" ".join(example["tokens"]).replace("\n", " ").strip(),
-                        label=create_label(example=example),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=" ".join(tokens).replace("\n", " ").strip(), label=""
-                    )
-                    for tokens in examples["tokens"]
-                ]
-
-            case TaskGroup.QUESTION_ANSWERING:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["context"].replace("\n", " ").strip(),
-                        question=example["question"].replace("\n", " ").strip(),
-                        label=example["answers"]["text"][0].replace("\n", " "),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=context.replace("\n", " ").strip(),
-                        question=question.replace("\n", " ").strip(),
-                        label="",
-                    )
-                    for context, question in zip(
-                        examples["context"], examples["question"]
-                    )
-                ]
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        if self.buffer["instruction_model"]:
-            few_shot_messages = [
-                dict(role=role, content=content)
-                for prompt, label in few_shot_sections
-                for role, content in [("user", prompt), ("assistant", label)]
-            ]
-
-            messages_list = [
-                few_shot_messages + [dict(role="user", content=prompt)]
-                for prompt, _ in new_sections
-            ]
-
-            # Pick the chat template that matches the language of the dataset, if such a
-            # template exists
-            chat_template: str | None = None
-            if isinstance(self._tokenizer.chat_template, dict):
-                language_codes = [
-                    language.code for language in self.dataset_config.languages
-                ]
-                for name, candidate_template in self._tokenizer.chat_template.items():
-                    if name.lower() in language_codes:
-                        chat_template = candidate_template
-                        log_once(
-                            f"Using the {name!r} chat template for the tokenizer for "
-                            f"model {self.model_config.model_id!r}.",
-                            level=logging.DEBUG,
-                        )
-                        break
-
-            texts = [
-                self._tokenizer.apply_chat_template(
-                    conversation=messages,
-                    tokenize=False,
-                    add_generation_prompt=True,
-                    chat_template=chat_template,
-                )
-                for messages in messages_list
-            ]
-
-            examples["text"] = texts
-
-        else:
-            prompt_prefix = ""
-            if self.dataset_config.prompt_prefix:
-                prompt_prefix = self.dataset_config.prompt_prefix + "\n\n"
-
-            few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
-            if few_shot_prompt:
-                few_shot_prompt += "\n\n"
-
-            examples["text"] = [
-                prompt_prefix + few_shot_prompt + new_prompt
-                for new_prompt, _ in new_sections
-            ]
-
-        return examples
-
     @property
     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.
euroeval/benchmarker.py
CHANGED
euroeval/data_models.py
CHANGED
@@ -531,7 +531,9 @@ class DatasetConfig:

         # Convert labels to single-quoted labels - and remove duplicates
         quoted_labels = [
-            f"'{label}'"
+            f"'{self.prompt_label_mapping[label]}'"
+            for label in set(self.labels)
+            if label in self.prompt_label_mapping
         ]

         if not quoted_labels:
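The effect is that the quoted label list used in prompts is now built from the localized prompt labels rather than the raw label names. A small self-contained illustration with made-up values (not an actual EuroEval config):

labels = ["positive", "negative", "positive"]
prompt_label_mapping = {"positive": "positief", "negative": "negatief"}

# Deduplicate the raw labels, map each through the prompt label mapping, and quote it.
quoted_labels = [
    f"'{prompt_label_mapping[label]}'"
    for label in set(labels)
    if label in prompt_label_mapping
]
print(quoted_labels)  # e.g. ["'positief'", "'negatief'"] (set order is not guaranteed)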
euroeval/dataset_configs/dutch.py
CHANGED

@@ -6,13 +6,14 @@ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM

 ### Official datasets ###

-
-    name="
+DBRD_CONFIG = DatasetConfig(
+    name="dbrd",
     pretty_name="the truncated version of the Dutch sentiment classification "
-    "dataset
-    huggingface_id="EuroEval/
+    "dataset DBRD",
+    huggingface_id="EuroEval/dbrd-mini",
     task=SENT,
     languages=[NL],
+    _labels=["negative", "positive"],
 )

 SCALA_NL_CONFIG = DatasetConfig(
@@ -71,18 +72,6 @@ HELLASWAG_NL_CONFIG = DatasetConfig(

 ### Unofficial datasets ###

-DBRD_CONFIG = DatasetConfig(
-    name="dbrd",
-    pretty_name="the truncated version of the Dutch sentiment classification "
-    "dataset DBRD",
-    huggingface_id="EuroEval/dbrd-mini",
-    task=SENT,
-    languages=[NL],
-    _labels=["negative", "positive"],
-    _prompt_label_mapping=dict(positive="positief", negative="negatief"),
-    unofficial=True,
-)
-
 DUTCH_COLA_CONFIG = DatasetConfig(
     name="dutch-cola",
     pretty_name="the truncated version of the Dutch linguistic acceptability dataset "
euroeval/generation_utils.py
ADDED

@@ -0,0 +1,346 @@
+"""Utility functions related to generative models."""
+
+import itertools as it
+import json
+import logging
+import random
+import typing as t
+
+from .enums import TaskGroup
+from .exceptions import InvalidBenchmark
+from .utils import log_once
+
+if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+    from transformers.tokenization_utils import PreTrainedTokenizer
+
+    from .data_models import DatasetConfig, ModelConfig
+
+logger = logging.getLogger("euroeval")
+
+
+def extract_few_shot_examples(
+    dataset: "DatasetDict", dataset_config: "DatasetConfig", itr_idx: int
+) -> list[dict[str, t.Any]]:
+    """Extract few-shot examples from a dataset.
+
+    This will always extract the examples from the training split.
+
+    We ensure that the few-shot examples are unique by picking them one at a time.
+
+    Args:
+        dataset:
+            The dataset to extract the few-shot examples from.
+        dataset_config:
+            The dataset configuration.
+        itr_idx:
+            The index of the dataset in the iterator.
+
+    Returns:
+        The few-shot examples.
+    """
+    random_seed = 4242 + itr_idx
+    num_few_shots = dataset_config.num_few_shot_examples
+    few_shot_examples: list[dict[str, t.Any]] = list()
+    shuffled_train = dataset["train"].shuffle(seed=random_seed)
+
+    match dataset_config.task.task_group:
+        case (
+            TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+        ):
+            # Locate the maximum number of tokens that constitutes a short example
+            for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
+                train_with_short_examples = dataset["train"].filter(
+                    lambda example: len(example["text"]) < max_num_tokens
+                )
+                num_short_examples = len(train_with_short_examples)
+                if num_short_examples >= dataset_config.num_few_shot_examples:
+                    break
+            else:
+                raise InvalidBenchmark(
+                    "Could not find enough short examples for few-shot learning."
+                )
+
+            shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
+            labels = it.cycle(dataset_config.labels)
+            while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                label = next(labels)
+                possible_examples = shuffled_train.filter(
+                    lambda x: x["label"].lower() == label.lower()
+                )
+                if len(possible_examples) == 0:
+                    continue
+                example = possible_examples.select(range(1))[0]
+                few_shot_examples.append(example)
+                shuffled_train = shuffled_train.filter(
+                    lambda x: x["text"] != example["text"]
+                )
+
+        case TaskGroup.TEXT_TO_TEXT:
+            while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                example = shuffled_train.select(range(1))[0]
+                few_shot_examples.append(example)
+                shuffled_train = shuffled_train.filter(
+                    lambda x: x["text"] != example["text"]
+                )
+
+        case TaskGroup.TOKEN_CLASSIFICATION:
+            labels = it.cycle(
+                [
+                    label.lower()
+                    for label in dataset_config.labels
+                    if label.lower().startswith("b-")
+                ]
+            )
+            while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                label = next(labels)
+                possible_examples = shuffled_train.filter(
+                    lambda x: label in [tag.lower() for tag in x["labels"]]
+                )
+                if len(possible_examples) == 0:
+                    continue
+                example = possible_examples.select(range(1))[0]
+                few_shot_examples.append(example)
+                shuffled_train = shuffled_train.filter(
+                    lambda x: x["tokens"] != example["tokens"]
+                )
+
+        case TaskGroup.QUESTION_ANSWERING:
+            # Locate the maximum number of tokens that constitutes a short example
+            for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
+                train_with_short_examples = dataset["train"].filter(
+                    lambda example: len(example["context"]) < max_num_tokens
+                )
+                num_short_examples = len(train_with_short_examples)
+                if num_short_examples >= dataset_config.num_few_shot_examples:
+                    break
+            else:
+                raise InvalidBenchmark(
+                    "Could not find enough short examples for few-shot learning."
+                )
+
+            shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
+            while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                example = shuffled_train.select(range(1))[0]
+                few_shot_examples.append(example)
+                shuffled_train = shuffled_train.filter(
+                    lambda x: x["context"] != example["context"]
+                )
+
+        case _:
+            raise NotImplementedError(
+                f"Unsupported task group: {dataset_config.task.task_group}."
+            )
+
+    random.seed(random_seed)
+    random.shuffle(few_shot_examples)
+    return few_shot_examples
+
+
+def apply_prompt(
+    examples: dict[str, t.Any],
+    few_shot_examples: list[dict[str, t.Any]],
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    instruction_model: bool,
+    always_populate_text_field: bool,
+    tokenizer: "PreTrainedTokenizer | None",
+) -> dict[str, t.Any]:
+    """Apply prompt template to an example, potentially with few-shot examples.
+
+    Args:
+        examples:
+            The examples to apply the few-shot examples to.
+        few_shot_examples:
+            The few-shot examples to apply.
+        dataset_config:
+            The dataset configuration.
+        instruction_model:
+            Whether the model is instruction-tuned.
+        always_populate_text_field:
+            Whether to always populate the 'text' field in the examples, as opposed to
+            the 'messages' field.
+        tokenizer:
+            The tokenizer to use for the model. If None, the tokenizer is not used.
+
+    Returns:
+        The example with the few-shot examples applied.
+    """
+    # Sanity check
+    if instruction_model and always_populate_text_field and tokenizer is None:
+        raise ValueError(
+            "The `tokenizer` argument must be provided when the model is instruction "
+            "tuned and when we are not just returning the raw messages."
+        )
+
+    def create_prompt(**kwargs: str) -> tuple[str, str]:
+        """Create a prompt from the given keyword arguments.
+
+        Args:
+            kwargs:
+                The keyword arguments to use in the prompt.
+
+        Returns:
+            A pair (prompt, label), where "label" is an empty string if the model is
+            not instruction tuned (as in this case it is included in the prompt).
+        """
+        label_key = "label" if "label" in kwargs else "target_text"
+        label = kwargs.pop(label_key)
+        assert label is not None, (
+            f"Found a None label for the prompt: {kwargs}. This should not happen."
+        )
+        label_mapping = dataset_config.prompt_label_mapping
+        label = label_mapping.get(label, label)
+        if instruction_model:
+            prompt = dataset_config.instruction_prompt.format(**kwargs)
+            return prompt, label
+        else:
+            kwargs[label_key] = label
+            return dataset_config.prompt_template.format(**kwargs), ""
+
+    match dataset_config.task.task_group:
+        case (
+            TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+        ):
+            few_shot_sections = [
+                create_prompt(
+                    text=example["text"].replace("\n", " ").strip(),
+                    label=example["label"].replace("\n", " ").strip(),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(text=text.replace("\n", " ").strip(), label="")
+                for text in examples["text"]
+            ]
+
+        case TaskGroup.TEXT_TO_TEXT:
+            few_shot_sections = [
+                create_prompt(
+                    text=example["text"].replace("\n", " ").strip(),
+                    target_text=example["target_text"].replace("\n", " ").strip(),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(text=text.replace("\n", " ").strip(), target_text="")
+                for text in examples["text"]
+            ]
+
+        case TaskGroup.TOKEN_CLASSIFICATION:
+
+            def create_label(example: dict) -> str:
+                prompt_labels = dataset_config.prompt_label_mapping.values()
+                labels: dict[str, list[str]] = {
+                    prompt_label: list() for prompt_label in prompt_labels
+                }
+                for token, label in zip(example["tokens"], example["labels"]):
+                    label = label.lower()
+                    if label == "o":
+                        continue
+                    prompt_label = dataset_config.prompt_label_mapping[label]
+                    if label.startswith("b-"):
+                        labels[prompt_label].append(token)
+                    elif label.startswith("i-"):
+                        labels[prompt_label][-1] += " " + token
+                return json.dumps(labels, ensure_ascii=False)
+
+            few_shot_sections = [
+                create_prompt(
+                    text=" ".join(example["tokens"]).replace("\n", " ").strip(),
+                    label=create_label(example=example),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(
+                    text=" ".join(tokens).replace("\n", " ").strip(), label=""
+                )
+                for tokens in examples["tokens"]
+            ]
+
+        case TaskGroup.QUESTION_ANSWERING:
+            few_shot_sections = [
+                create_prompt(
+                    text=example["context"].replace("\n", " ").strip(),
+                    question=example["question"].replace("\n", " ").strip(),
+                    label=example["answers"]["text"][0].replace("\n", " "),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(
+                    text=context.replace("\n", " ").strip(),
+                    question=question.replace("\n", " ").strip(),
+                    label="",
+                )
+                for context, question in zip(examples["context"], examples["question"])
+            ]
+
+        case _:
+            raise NotImplementedError(
+                f"Unsupported task group: {dataset_config.task.task_group}."
+            )
+
+    if instruction_model:
+        few_shot_messages = [
+            dict(role=role, content=content)
+            for prompt, label in few_shot_sections
+            for role, content in [("user", prompt), ("assistant", label)]
+        ]
+
+        messages_list = [
+            few_shot_messages + [dict(role="user", content=prompt)]
+            for prompt, _ in new_sections
+        ]
+
+        if not always_populate_text_field:
+            examples["messages"] = messages_list
+
+        else:
+            assert tokenizer is not None
+
+            # Pick the chat template that matches the language of the dataset, if such a
+            # template exists
+            chat_template: str | None = None
+            if isinstance(tokenizer.chat_template, dict):
+                language_codes = [
+                    language.code for language in dataset_config.languages
+                ]
+                for name, candidate_template in tokenizer.chat_template.items():
+                    if name.lower() in language_codes:
+                        chat_template = candidate_template
+                        log_once(
+                            f"Using the {name!r} chat template for the tokenizer for "
+                            f"model {model_config.model_id!r}.",
+                            level=logging.DEBUG,
+                        )
+                        break
+
+            texts = [
+                tokenizer.apply_chat_template(
+                    conversation=messages,
+                    tokenize=False,
+                    add_generation_prompt=True,
+                    chat_template=chat_template,
+                )
+                for messages in messages_list
+            ]
+
+            examples["text"] = texts
+
+    else:
+        prompt_prefix = ""
+        if dataset_config.prompt_prefix:
+            prompt_prefix = dataset_config.prompt_prefix + "\n\n"
+
+        few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
+        if few_shot_prompt:
+            few_shot_prompt += "\n\n"
+
+        examples["text"] = [
+            prompt_prefix + few_shot_prompt + new_prompt
+            for new_prompt, _ in new_sections
+        ]
+
+    return examples
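For instruction-tuned models, the key step in the new helper is flattening the few-shot (prompt, label) pairs into an alternating user/assistant message list, with the new example appended as a final user turn. A minimal, self-contained sketch of that flattening, using the same list-comprehension shape as `apply_prompt` above but with made-up data:

few_shot_sections = [("Classify: great film", "positive"), ("Classify: dull film", "negative")]
new_sections = [("Classify: decent film", "")]

# Each few-shot pair becomes a user turn followed by an assistant turn.
few_shot_messages = [
    dict(role=role, content=content)
    for prompt, label in few_shot_sections
    for role, content in [("user", prompt), ("assistant", label)]
]

# The new example is appended as the final user turn of each conversation.
messages_list = [
    few_shot_messages + [dict(role="user", content=prompt)]
    for prompt, _ in new_sections
]
print(messages_list[0])
# [{'role': 'user', 'content': 'Classify: great film'},
#  {'role': 'assistant', 'content': 'positive'},
#  {'role': 'user', 'content': 'Classify: dull film'},
#  {'role': 'assistant', 'content': 'negative'},
#  {'role': 'user', 'content': 'Classify: decent film'}]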
euroeval/scores.py
CHANGED
@@ -18,6 +18,7 @@ def log_scores(
     metric_configs: list["MetricConfig"],
     scores: list[dict[str, float]],
     model_id: str,
+    model_revision: str,
 ) -> "ScoreDict":
     """Log the scores.

@@ -30,13 +31,18 @@ def log_scores(
             The scores that are to be logged. This is a list of dictionaries full of
             scores.
         model_id:
-            The
+            The model ID of the model that was evaluated.
+        model_revision:
+            The revision of the model.

     Returns:
         A dictionary with keys 'raw_scores' and 'total', with 'raw_scores' being
         identical to `scores` and 'total' being a dictionary with the aggregated scores
         (means and standard errors).
     """
+    if model_revision and model_revision != "main":
+        model_id += f"@{model_revision}"
+
     logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")

     total_dict: dict[str, float] = dict()
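The new argument only changes how the model is named when the scores are logged: a non-default revision is appended to the model ID. A minimal sketch of that naming rule; the helper name `format_model_id` is just for the sketch, as the check is inlined in `log_scores`:

def format_model_id(model_id: str, model_revision: str) -> str:
    # Append the revision unless it is empty or the default "main" branch.
    if model_revision and model_revision != "main":
        model_id += f"@{model_revision}"
    return model_id

print(format_model_id("my-org/my-model", "main"))  # my-org/my-model
print(format_model_id("my-org/my-model", "v2"))    # my-org/my-model@v2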
{euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/RECORD
CHANGED

@@ -1,21 +1,22 @@
 euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
 euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
-euroeval/benchmarker.py,sha256=
+euroeval/benchmarker.py,sha256=OnjGVblWW20wSmA7Tr2c-qE3g8FIjxW6wTJySAcGxVk,48492
 euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
 euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
 euroeval/constants.py,sha256=p6kp_R6-Tq5LBvyXyT6Sa6N3SkjEElGS2LSZRBoQaYs,1985
 euroeval/data_loading.py,sha256=L_REtxefte5Ke4xE_Cz01zkfCyKlOYhSqT5ZXXulHPc,3992
-euroeval/data_models.py,sha256=
+euroeval/data_models.py,sha256=t5FwpGxiSIMe7iKae-tT7usUWki-ILzAFFm7dPJoFsk,22973
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=LRd7HoudupRp5-AX3L0X4hIAWCa6JVx-LViHPg7u7dg,5821
 euroeval/finetuning.py,sha256=IieAhgvxjeLHAHBief1Ay-STcCosQmrDHFTRTXFZX0Q,10743
 euroeval/generation.py,sha256=LSsskfLjIJ-c3gQxmr7eiAobPOm-5bU9vnR7uHQ7XmU,10745
+euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
 euroeval/human_evaluation.py,sha256=VGvw1X6Mkdf22r-THSNWXMIqyJP44yh4rW53vq-0huo,27681
 euroeval/languages.py,sha256=IQUbGMyn7pxAyM70M0FTO80m92Q4KgIU604MJhVia-Q,8513
 euroeval/model_cache.py,sha256=n39yFpZkudBCVwz1EQpZ-g5BQtlQemQ5nP3IiFKJZHg,8275
 euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
 euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
-euroeval/scores.py,sha256=
+euroeval/scores.py,sha256=TovjCZD8wmGrIjA4v5oAQp18P5KVcHvakkByDh0Hstk,3059
 euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
 euroeval/tasks.py,sha256=87gbe__K5KNIb1aBSuwGnMPmZgamJFecNNYmNgMxaVo,7069
 euroeval/tokenization_utils.py,sha256=fbMVAOkRdcpf9L2SVechPpmWYgDXgQcc-sDrYu21wFI,12487

@@ -25,11 +26,11 @@ euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwR
 euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
 euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
 euroeval/benchmark_modules/hf.py,sha256=yFApLL4_ia5Kw2iat5RSI8h5RhI4OP04HlzYidlhBCs,44012
-euroeval/benchmark_modules/litellm.py,sha256=
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/litellm.py,sha256=v_rbCm2FiTMqcUui_09k3E1-s5uOmbfAvSy2c7Mm0_E,42636
+euroeval/benchmark_modules/vllm.py,sha256=Q-3vtZz5XxQQImJxOiF0XDrQ4T_p0bkgdPw1Jobgu3s,39380
 euroeval/dataset_configs/__init__.py,sha256=fkD1hzW7szJLc1MdK-AY4EBFWBUX5Z8t4f9uBHQnRvU,1858
 euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
-euroeval/dataset_configs/dutch.py,sha256=
+euroeval/dataset_configs/dutch.py,sha256=r21nxEvMmBkKqPXVW082batPsxJ9d0RB4DzngOTMJSk,3185
 euroeval/dataset_configs/english.py,sha256=yHw7D0zSNVbiSBAjR1mWX4V5FSkhqy4y-o-pnyWCLxE,2323
 euroeval/dataset_configs/faroese.py,sha256=QQgLe5gv0f3AtXe5rV65xZ98gFgyITQPDr3UwO4Bnv4,1350
 euroeval/dataset_configs/finnish.py,sha256=BIfcxdgJu4CfevHAjzwH7cYd8Xd9DGrm49lcJZcGVQM,1730

@@ -53,8 +54,8 @@ euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iY
 euroeval/task_group_utils/sequence_classification.py,sha256=xPz1gJioK96iv2bNoDWiC2EJkhRvRd7QZNgY8bT237c,11703
 euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
 euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
-euroeval-15.7.
-euroeval-15.7.
-euroeval-15.7.
-euroeval-15.7.
-euroeval-15.7.
+euroeval-15.7.1.dist-info/METADATA,sha256=Fj6QejwQCK0zGuP_DHSQ7sul195ivUqOUCT5AVxgLSI,13669
+euroeval-15.7.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.7.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.7.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.7.1.dist-info/RECORD,,

{euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/WHEEL
File without changes

{euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/entry_points.txt
File without changes

{euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/licenses/LICENSE
File without changes