EuroEval: euroeval-15.4.2-py3-none-any.whl → euroeval-15.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +2 -2
- euroeval/benchmark_modules/base.py +3 -2
- euroeval/benchmark_modules/fresh.py +8 -6
- euroeval/benchmark_modules/hf.py +44 -33
- euroeval/benchmark_modules/litellm.py +314 -120
- euroeval/benchmark_modules/vllm.py +99 -59
- euroeval/benchmarker.py +52 -21
- euroeval/callbacks.py +2 -2
- euroeval/constants.py +9 -2
- euroeval/data_models.py +258 -44
- euroeval/dataset_configs/__init__.py +61 -0
- euroeval/dataset_configs/danish.py +120 -0
- euroeval/dataset_configs/dutch.py +123 -0
- euroeval/dataset_configs/english.py +88 -0
- euroeval/dataset_configs/faroese.py +53 -0
- euroeval/dataset_configs/french.py +83 -0
- euroeval/dataset_configs/german.py +91 -0
- euroeval/dataset_configs/icelandic.py +148 -0
- euroeval/dataset_configs/italian.py +81 -0
- euroeval/dataset_configs/norwegian.py +178 -0
- euroeval/dataset_configs/spanish.py +78 -0
- euroeval/dataset_configs/swedish.py +100 -0
- euroeval/exceptions.py +10 -10
- euroeval/finetuning.py +6 -10
- euroeval/generation.py +1 -0
- euroeval/human_evaluation.py +2 -2
- euroeval/languages.py +20 -13
- euroeval/model_cache.py +1 -1
- euroeval/model_loading.py +1 -12
- euroeval/prompt_templates/__init__.py +8 -0
- euroeval/prompt_templates/linguistic_acceptability.py +112 -0
- euroeval/prompt_templates/multiple_choice.py +97 -0
- euroeval/prompt_templates/named_entity_recognition.py +257 -0
- euroeval/prompt_templates/reading_comprehension.py +118 -0
- euroeval/prompt_templates/sentiment_classification.py +137 -0
- euroeval/prompt_templates/summarization.py +97 -0
- euroeval/speed_benchmark.py +1 -1
- euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
- euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
- euroeval/{task_utils → task_group_utils}/sequence_classification.py +45 -10
- euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
- euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
- euroeval/tasks.py +54 -0
- euroeval/tokenization_utils.py +343 -0
- euroeval/types.py +3 -1
- euroeval/utils.py +5 -254
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/METADATA +31 -9
- euroeval-15.6.0.dist-info/RECORD +59 -0
- euroeval/dataset_configs.py +0 -2408
- euroeval-15.4.2.dist-info/RECORD +0 -40
- /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
euroeval/tasks.py
CHANGED
@@ -2,6 +2,14 @@

 from .data_models import MetricConfig, Task
 from .enums import TaskGroup
+from .prompt_templates import (
+    LA_TEMPLATES,
+    MULTIPLE_CHOICE_TEMPLATES,
+    NER_TEMPLATES,
+    RC_TEMPLATES,
+    SENT_TEMPLATES,
+    SUMM_TEMPLATES,
+)


 def get_all_tasks() -> dict[str, Task]:
@@ -16,6 +24,7 @@ def get_all_tasks() -> dict[str, Task]:
 LA = Task(
     name="linguistic-acceptability",
     task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
+    template_dict=LA_TEMPLATES,
     metrics=[
         MetricConfig(
             name="mcc",
@@ -31,12 +40,16 @@ LA = Task(
             compute_kwargs=dict(average="macro"),
         ),
     ],
+    default_num_few_shot_examples=12,
+    default_max_generated_tokens=5,
+    default_labels=["correct", "incorrect"],
 )


 NER = Task(
     name="named-entity-recognition",
     task_group=TaskGroup.TOKEN_CLASSIFICATION,
+    template_dict=NER_TEMPLATES,
     metrics=[
         MetricConfig(
             name="micro_f1_no_misc",
@@ -51,12 +64,26 @@ NER = Task(
             results_key="overall_f1",
         ),
     ],
+    default_num_few_shot_examples=8,
+    default_max_generated_tokens=128,
+    default_labels=[
+        "o",
+        "b-loc",
+        "i-loc",
+        "b-org",
+        "i-org",
+        "b-per",
+        "i-per",
+        "b-misc",
+        "i-misc",
+    ],
 )


 RC = Task(
     name="reading-comprehension",
     task_group=TaskGroup.QUESTION_ANSWERING,
+    template_dict=RC_TEMPLATES,
     metrics=[
         MetricConfig(
             name="f1",
@@ -73,12 +100,16 @@ RC = Task(
             postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:.2f}%"),
         ),
     ],
+    default_num_few_shot_examples=4,
+    default_max_generated_tokens=32,
+    default_labels=["start_positions", "end_positions"],
 )


 SENT = Task(
     name="sentiment-classification",
     task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
+    template_dict=SENT_TEMPLATES,
     metrics=[
         MetricConfig(
             name="mcc",
@@ -94,12 +125,16 @@ SENT = Task(
             compute_kwargs=dict(average="macro"),
         ),
     ],
+    default_num_few_shot_examples=12,
+    default_max_generated_tokens=5,
+    default_labels=["positive", "neutral", "negative"],
 )


 SUMM = Task(
     name="summarization",
     task_group=TaskGroup.TEXT_TO_TEXT,
+    template_dict=SUMM_TEMPLATES,
     metrics=[
         MetricConfig(
             name="bertscore",
@@ -117,12 +152,16 @@ SUMM = Task(
             results_key="rougeL",
         ),
     ],
+    default_num_few_shot_examples=1,
+    default_max_generated_tokens=256,
+    default_labels=[],
 )


 KNOW = Task(
     name="knowledge",
     task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[
         MetricConfig(
             name="mcc",
@@ -137,12 +176,16 @@ KNOW = Task(
             results_key="accuracy",
         ),
     ],
+    default_num_few_shot_examples=5,
+    default_max_generated_tokens=5,
+    default_labels=["a", "b", "c", "d"],
 )


 MCRC = Task(
     name="multiple-choice-reading-comprehension",
     task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[
         MetricConfig(
             name="mcc",
@@ -157,12 +200,16 @@ MCRC = Task(
             results_key="accuracy",
         ),
     ],
+    default_num_few_shot_examples=5,
+    default_max_generated_tokens=5,
+    default_labels=["a", "b", "c", "d"],
 )


 COMMON_SENSE = Task(
     name="common-sense-reasoning",
     task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[
         MetricConfig(
             name="mcc",
@@ -177,12 +224,16 @@ COMMON_SENSE = Task(
             results_key="accuracy",
         ),
     ],
+    default_num_few_shot_examples=5,
+    default_max_generated_tokens=5,
+    default_labels=["a", "b", "c", "d"],
 )


 SPEED = Task(
     name="speed",
     task_group=TaskGroup.SPEED,
+    template_dict={},
     metrics=[
         MetricConfig(
             name="speed",
@@ -199,4 +250,7 @@ SPEED = Task(
             postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
         ),
     ],
+    default_num_few_shot_examples=0,
+    default_max_generated_tokens=5,
+    default_labels=[],
 )
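The new Task fields above (template_dict, default_num_few_shot_examples, default_max_generated_tokens, default_labels) move the per-task defaults into the task definitions themselves. A rough sketch of how a downstream task definition could use them, based only on the constructor arguments visible in this diff; the "sentiment-classification" dictionary key, the `.metrics` attribute access, and the custom task itself are illustrative assumptions, not part of the package:

# Illustrative sketch only: the Task keyword arguments are taken from the diff
# above, while the get_all_tasks() key and the `.metrics` attribute on Task are
# assumptions about the public API.
from euroeval.data_models import Task
from euroeval.enums import TaskGroup
from euroeval.prompt_templates import SENT_TEMPLATES
from euroeval.tasks import get_all_tasks

all_tasks = get_all_tasks()
sent_task = all_tasks["sentiment-classification"]  # assumed key; tasks appear to be keyed by name

CUSTOM_SENT = Task(
    name="custom-sentiment-classification",
    task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
    template_dict=SENT_TEMPLATES,
    metrics=sent_task.metrics,  # reuse the existing metric configs
    default_num_few_shot_examples=4,
    default_max_generated_tokens=5,
    default_labels=["positive", "neutral", "negative"],
)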
euroeval/tokenization_utils.py
ADDED
@@ -0,0 +1,343 @@
+"""Utility functions related to tokenization."""
+
+import logging
+import re
+import typing as t
+
+import torch
+
+from .constants import TASK_GROUPS_USING_LOGPROBS
+from .exceptions import InvalidModel
+from .utils import log_once
+
+if t.TYPE_CHECKING:
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+
+    from .data_models import DatasetConfig
+
+
+logger = logging.getLogger("euroeval")
+
+
+def get_special_token_metadata(tokenizer: "PreTrainedTokenizerBase") -> dict:
+    """Get the special token metadata for a tokenizer.
+
+    Args:
+        tokenizer:
+            The tokenizer.
+
+    Returns:
+        The special token metadata.
+    """
+    # Create some test input IDs, to check if the tokenizer is adding special tokens
+    test_input_ids = tokenizer("Test").input_ids
+
+    # Extract the CLS token IDs from the tokenizer, if it's using them
+    has_cls_token = True
+    if tokenizer.cls_token_id in test_input_ids:
+        cls_token_id = tokenizer.cls_token_id
+        cls_token = tokenizer.cls_token
+    elif tokenizer.bos_token_id in test_input_ids:
+        cls_token_id = tokenizer.bos_token_id
+        cls_token = tokenizer.bos_token
+    elif tokenizer.cls_token is not None:
+        cls_token_id = tokenizer.cls_token_id
+        cls_token = tokenizer.cls_token
+        has_cls_token = False
+    else:
+        cls_token_id = tokenizer.bos_token_id
+        cls_token = tokenizer.bos_token
+        has_cls_token = False
+
+    # Extract the SEP token IDs from the tokenizer, if it's using them
+    has_sep_token = True
+    if tokenizer.sep_token_id in test_input_ids:
+        sep_token = tokenizer.sep_token
+    elif tokenizer.eos_token_id in test_input_ids:
+        sep_token = tokenizer.eos_token
+    elif tokenizer.sep_token is not None:
+        sep_token = tokenizer.sep_token
+        has_sep_token = False
+    else:
+        sep_token = tokenizer.eos_token
+        has_sep_token = False
+
+    return dict(
+        cls_token_id=cls_token_id,
+        cls_token=cls_token,
+        sep_token=sep_token,
+        has_cls_token=has_cls_token,
+        has_sep_token=has_sep_token,
+    )
+
+
+def should_prompts_be_stripped(
+    labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
+) -> bool:
+    """Determine if we should strip the prompts for few-shot evaluation.
+
+    This is the case if the tokenizer needs to include the space as part of the label
+    token. The strategy is thus to tokenize a label with a preceeding colon (as in the
+    prompts), i.e., ": positive", and check if the tokenization starts with the tokens
+    of ": ". If this is the case, then we should not strip the prompts, since the
+    tokenizer produces the whitespace token separately.
+
+    Args:
+        labels_to_be_generated:
+            The labels that are to be generated.
+        tokenizer:
+            The tokenizer used to tokenize the labels.
+
+    Returns:
+        Whether we should strip the prompts.
+    """
+    strip_prompts = True
+    for label in labels_to_be_generated:
+        colon_tokens = tokenizer(": ", add_special_tokens=False).input_ids
+        label_tokens = tokenizer(": " + label, add_special_tokens=False).input_ids
+
+        if isinstance(colon_tokens, torch.Tensor):
+            colon_tokens = list(colon_tokens.squeeze(0))
+        if isinstance(label_tokens, torch.Tensor):
+            label_tokens = list(label_tokens.squeeze(0))
+
+        label_tokens_start_with_colon_tokens = (
+            label_tokens[: len(colon_tokens)] == colon_tokens
+        )
+        if label_tokens_start_with_colon_tokens:
+            strip_prompts = False
+
+    return strip_prompts
+
+
+def should_prefix_space_be_added_to_labels(
+    labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
+) -> bool:
+    """Determine if we should add a prefix space to the labels.
+
+    This is the case if the prompts are stripped and the tokenizer doesn't
+    automatically add prefix whitespaces to the labels.
+
+    Args:
+        labels_to_be_generated:
+            The labels that are to be generated.
+        tokenizer:
+            The tokenizer used to tokenize the labels.
+
+    Returns:
+        Whether we should add a prefix space to the labels.
+    """
+    if not should_prompts_be_stripped(
+        labels_to_be_generated=labels_to_be_generated, tokenizer=tokenizer
+    ):
+        return False
+
+    whitespace_token = tokenizer.convert_ids_to_tokens(
+        ids=tokenizer(" ", add_special_tokens=False).input_ids[0]
+    )[0]
+
+    add_prefix_space = True
+    for label in labels_to_be_generated:
+        label_tokens = tokenizer(label, add_special_tokens=False).input_ids
+        if isinstance(label_tokens, torch.Tensor):
+            label_tokens = list(label_tokens.squeeze(0))
+        first_label_token: int = int(label_tokens[0])
+        first_character_of_label = tokenizer.convert_ids_to_tokens(first_label_token)[0]
+        has_prefix_space = first_character_of_label == whitespace_token
+        if has_prefix_space:
+            add_prefix_space = False
+            break
+
+    return add_prefix_space
+
+
+def get_bos_token(tokenizer: "PreTrainedTokenizer") -> tuple[str, int]:
+    """Get the beginning-of-sequence token from a tokenizer.
+
+    Args:
+        tokenizer:
+            The tokenizer.
+
+    Returns:
+        A pair (token, token_id) representing the beginning-of-sequence token and its
+        token ID.
+    """
+    if isinstance(tokenizer.bos_token, str) and isinstance(tokenizer.bos_token_id, int):
+        return tokenizer.bos_token, tokenizer.bos_token_id
+
+    vocab: dict[str, int] = tokenizer.get_vocab()
+
+    candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "[CLS]"]
+    for candidate_bos_token in candidate_bos_tokens:
+        if candidate_bos_token in vocab:
+            bos_token = candidate_bos_token
+            bos_token_id = vocab[bos_token]
+            break
+    else:
+        raise InvalidModel(
+            "The model does not have a beginning-of-sequence token. Please ensure that "
+            "this has been set in the tokenizer's configuration."
+        )
+
+    return bos_token, bos_token_id
+
+
+def get_eos_token(tokenizer: "PreTrainedTokenizer") -> tuple[str, int]:
+    """Get the end-of-sequence token from a tokenizer.
+
+    Args:
+        tokenizer:
+            The tokenizer.
+
+    Returns:
+        A pair (token, token_id) representing the end-of-sequence token and its token
+        ID.
+    """
+    if isinstance(tokenizer.eos_token, str) and isinstance(tokenizer.eos_token_id, int):
+        return tokenizer.eos_token, tokenizer.eos_token_id
+
+    vocab: dict[str, int] = tokenizer.get_vocab()
+
+    candidate_eos_tokens = ["</s>", "<|end_of_text|>", "[SEP]"]
+    for candidate_eos_token in candidate_eos_tokens:
+        if candidate_eos_token in vocab:
+            eos_token = candidate_eos_token
+            eos_token_id = vocab[eos_token]
+            break
+    else:
+        raise InvalidModel(
+            "The model does not have an end-of-sequence token. Please ensure that this "
+            "has been set in the tokenizer's configuration."
+        )
+
+    return eos_token, eos_token_id
+
+
+def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | None:
+    """Get the end token ID for chat models.
+
+    This is only relevant for tokenizers with a chat template.
+
+    Args:
+        tokenizer:
+            The tokenizer.
+
+    Returns:
+        The token IDs used to end chats, or None if the tokenizer does not have a chat
+        template.
+
+    Raises:
+        ValueError:
+            If the end-of-chat token could not be located.
+    """
+    if tokenizer.chat_template is None:
+        return None
+
+    user_message: dict[str, str] = dict(role="user", content="X")
+    token_ids: list[int] = tokenizer.apply_chat_template(conversation=[user_message])  # type: ignore[assignment]
+
+    for idx, token in enumerate(tokenizer.convert_ids_to_tokens(token_ids)):
+        token_id = tokenizer.convert_tokens_to_ids(token)
+        assert isinstance(token_id, int)
+        token = tokenizer.decode([token_id])
+        if "X" in token:
+            x_token_index = idx
+            break
+    else:
+        raise ValueError("Could not locate the end-of-chat token for the model.")
+
+    end_of_chat_tokens = token_ids[x_token_index + 1 :]
+    if len(end_of_chat_tokens) == 0:
+        return None
+    return end_of_chat_tokens
+
+
+def get_first_label_token_mapping(
+    dataset_config: "DatasetConfig", tokenizer: "PreTrainedTokenizer | None"
+) -> dict[str, str] | bool:
+    """Check if the model should output scores.
+
+    Args:
+        dataset_config:
+            The dataset configuration.
+        tokenizer:
+            The tokenizer, or None if not available.
+
+    Returns:
+        A mapping from labels to the first token in each label, or alternatively a
+        Boolean value indicating whether the model should output scores (if the mapping
+        is outputted then the model will always output scores).
+    """
+    # If we do not have any tokenizer, then we cannot check if the model should output
+    # scores and we just assume it should if the dataset supports it
+    output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
+    if tokenizer is None:
+        if output_scores:
+            log_once(
+                "The model will output scores, since the dataset supports it and no "
+                "tokenizer is available.",
+                level=logging.DEBUG,
+            )
+        else:
+            log_once(
+                "The model will not output scores, since the dataset does not support "
+                "it and no tokenizer is available.",
+                level=logging.DEBUG,
+            )
+        return output_scores
+
+    # If there are labels associated with the dataset, and that the first token of each
+    # label is distinct, then we can safely use the logprobs
+    if output_scores and dataset_config.labels:
+        local_labels = [
+            dataset_config.prompt_label_mapping[label].strip()
+            for label in dataset_config.labels
+        ]
+
+        # Get the first token of each label, where we add a prefix space if needed
+        add_prefix_space = (
+            should_prefix_space_be_added_to_labels(
+                labels_to_be_generated=local_labels, tokenizer=tokenizer
+            )
+            and tokenizer.chat_template is None
+        )
+        first_tokens = [
+            tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)[0]
+            for label in local_labels
+        ]
+        first_tokens = [
+            re.sub(
+                pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", repl="", string=token.lower()
+            )
+            for token in first_tokens
+        ]
+
+        # Build a mapping from labels to the first token in each label if the first
+        # tokens are distinct
+        if len(first_tokens) == len(set(first_tokens)):
+            log_once(
+                "The model will output scores, since the first tokens of the labels "
+                "are distinct.",
+                level=logging.DEBUG,
+            )
+            return {
+                label: first_token
+                for label, first_token in zip(local_labels, first_tokens)
+            }
+        else:
+            log_once(
+                "The model will not output scores, since the first tokens of the "
+                "labels are not distinct. The first tokens for the labels "
+                f"{local_labels} are {first_tokens}"
+            )
+            return False
+
+    # Otherwise, we assume that the model should not output scores, to avoid potential
+    # evaluation errors. This will force the label extraction to rely on word edit
+    # distance instead of logprobs.
+    log_once(
+        "The model will not output scores, since the dataset does not have labels.",
+        level=logging.DEBUG,
+    )
+    return False
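The helpers in the new euroeval/tokenization_utils.py module are self-contained and can be exercised directly against a Hugging Face tokenizer. A usage sketch (the model ID is only an example; any tokenizer exposing the standard PreTrainedTokenizer interface should work):

# Usage sketch for the new tokenization helpers. The function names and keyword
# arguments come from the diff above; the tokenizer checkpoint is illustrative.
from transformers import AutoTokenizer

from euroeval.tokenization_utils import (
    get_bos_token,
    get_eos_token,
    should_prompts_be_stripped,
)

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")  # example model

# Whether few-shot prompts should have their trailing whitespace stripped,
# based on how the tokenizer splits ": <label>" sequences.
strip = should_prompts_be_stripped(
    labels_to_be_generated=["positive", "negative"], tokenizer=tokenizer
)

# BOS/EOS tokens, falling back to common candidates ("<s>", "[CLS]", ...) when
# the tokenizer does not declare them explicitly.
bos_token, bos_token_id = get_bos_token(tokenizer=tokenizer)
eos_token, eos_token_id = get_eos_token(tokenizer=tokenizer)

print(strip, bos_token, bos_token_id, eos_token, eos_token_id)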
euroeval/types.py
CHANGED
@@ -3,6 +3,7 @@
 import typing as t

 from numpy.typing import NDArray
+from transformers.trainer_utils import EvalPrediction

 if t.TYPE_CHECKING:
     from .data_models import GenerativeModelOutput
@@ -18,7 +19,8 @@ class ComputeMetricsFunction(t.Protocol):

     def __call__(
         self,
-        model_outputs_and_labels: tuple[
+        model_outputs_and_labels: EvalPrediction
+        | tuple[
             NDArray | list[str] | list[list[str]], NDArray | list[str] | list[list[str]]
         ],
     ) -> dict[str, float]: