EuroEval 15.16.0-py3-none-any.whl → 16.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (63)
  1. euroeval/__init__.py +3 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +161 -114
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +13 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +53 -7
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +38 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +46 -14
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +234 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  41. euroeval/prompt_templates/multiple_choice.py +23 -2
  42. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  43. euroeval/prompt_templates/reading_comprehension.py +42 -2
  44. euroeval/prompt_templates/sentiment_classification.py +46 -2
  45. euroeval/prompt_templates/summarization.py +24 -4
  46. euroeval/scores.py +7 -2
  47. euroeval/speed_benchmark.py +6 -6
  48. euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  49. euroeval/task_group_utils/question_answering.py +35 -28
  50. euroeval/task_group_utils/sequence_classification.py +96 -23
  51. euroeval/task_group_utils/text_to_text.py +7 -3
  52. euroeval/task_group_utils/token_classification.py +47 -75
  53. euroeval/tasks.py +31 -6
  54. euroeval/tokenization_utils.py +295 -207
  55. euroeval/utils.py +118 -34
  56. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +11 -14
  57. euroeval-16.0.0.dist-info/RECORD +69 -0
  58. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
  59. euroeval/human_evaluation.py +0 -738
  60. euroeval/metrics.py +0 -470
  61. euroeval-15.16.0.dist-info/RECORD +0 -63
  62. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/tokenization_utils.py
@@ -5,8 +5,10 @@ import re
 import typing as t

 import torch
+from transformers import MistralCommonTokenizer
+
+from euroeval.exceptions import InvalidModel

-from .constants import TASK_GROUPS_USING_LOGPROBS
 from .enums import GenerativeType
 from .utils import log_once

@@ -20,47 +22,47 @@ if t.TYPE_CHECKING:
 logger = logging.getLogger("euroeval")


-def get_special_token_metadata(tokenizer: "PreTrainedTokenizerBase") -> dict:
-    """Get the special token metadata for a tokenizer.
+def get_special_token_metadata(tokeniser: "PreTrainedTokenizerBase") -> dict:
+    """Get the special token metadata for a tokeniser.

     Args:
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.

     Returns:
         The special token metadata.
     """
-    # Create some test input IDs, to check if the tokenizer is adding special tokens
-    test_input_ids = tokenizer("Test").input_ids
+    # Create some test input IDs, to check if the tokeniser is adding special tokens
+    test_input_ids = tokeniser("Test").input_ids

-    # Extract the CLS token IDs from the tokenizer, if it's using them
+    # Extract the CLS token IDs from the tokeniser, if it's using them
     has_cls_token = True
-    if tokenizer.cls_token_id in test_input_ids:
-        cls_token_id = tokenizer.cls_token_id
-        cls_token = tokenizer.cls_token
-    elif tokenizer.bos_token_id in test_input_ids:
-        cls_token_id = tokenizer.bos_token_id
-        cls_token = tokenizer.bos_token
-    elif tokenizer.cls_token is not None:
-        cls_token_id = tokenizer.cls_token_id
-        cls_token = tokenizer.cls_token
+    if tokeniser.cls_token_id in test_input_ids:
+        cls_token_id = tokeniser.cls_token_id
+        cls_token = tokeniser.cls_token
+    elif tokeniser.bos_token_id in test_input_ids:
+        cls_token_id = tokeniser.bos_token_id
+        cls_token = tokeniser.bos_token
+    elif tokeniser.cls_token is not None:
+        cls_token_id = tokeniser.cls_token_id
+        cls_token = tokeniser.cls_token
         has_cls_token = False
     else:
-        cls_token_id = tokenizer.bos_token_id
-        cls_token = tokenizer.bos_token
+        cls_token_id = tokeniser.bos_token_id
+        cls_token = tokeniser.bos_token
         has_cls_token = False

-    # Extract the SEP token IDs from the tokenizer, if it's using them
+    # Extract the SEP token IDs from the tokeniser, if it's using them
     has_sep_token = True
-    if tokenizer.sep_token_id in test_input_ids:
-        sep_token = tokenizer.sep_token
-    elif tokenizer.eos_token_id in test_input_ids:
-        sep_token = tokenizer.eos_token
-    elif tokenizer.sep_token is not None:
-        sep_token = tokenizer.sep_token
+    if tokeniser.sep_token_id in test_input_ids:
+        sep_token = tokeniser.sep_token
+    elif tokeniser.eos_token_id in test_input_ids:
+        sep_token = tokeniser.eos_token
+    elif tokeniser.sep_token is not None:
+        sep_token = tokeniser.sep_token
         has_sep_token = False
     else:
-        sep_token = tokenizer.eos_token
+        sep_token = tokeniser.eos_token
         has_sep_token = False

     return dict(
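
A minimal usage sketch of this helper, assuming EuroEval 16.0.0 and transformers are installed; the exact keys of the returned dict are inferred from the visible code and may differ slightly:

from transformers import AutoTokenizer

from euroeval.tokenization_utils import get_special_token_metadata

# A BERT-style tokeniser adds [CLS]/[SEP] to the test encoding, so the helper
# should detect both special tokens.
tokeniser = AutoTokenizer.from_pretrained("bert-base-uncased")
metadata = get_special_token_metadata(tokeniser)
print(metadata)  # expected to include has_cls_token=True and has_sep_token=True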
@@ -73,29 +75,29 @@ def get_special_token_metadata(tokenizer: "PreTrainedTokenizerBase") -> dict:


 def should_prompts_be_stripped(
-    labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
+    labels_to_be_generated: list[str], tokeniser: "PreTrainedTokenizer"
 ) -> bool:
     """Determine if we should strip the prompts for few-shot evaluation.

-    This is the case if the tokenizer needs to include the space as part of the label
+    This is the case if the tokeniser needs to include the space as part of the label
     token. The strategy is thus to tokenize a label with a preceding colon (as in the
     prompts), i.e., ": positive", and check if the tokenization starts with the tokens
     of ": ". If this is the case, then we should not strip the prompts, since the
-    tokenizer produces the whitespace token separately.
+    tokeniser produces the whitespace token separately.

     Args:
         labels_to_be_generated:
             The labels that are to be generated.
-        tokenizer:
-            The tokenizer used to tokenize the labels.
+        tokeniser:
+            The tokeniser used to tokenize the labels.

     Returns:
         Whether we should strip the prompts.
     """
     strip_prompts = True
     for label in labels_to_be_generated:
-        colon_tokens = tokenizer(": ", add_special_tokens=False).input_ids
-        label_tokens = tokenizer(": " + label, add_special_tokens=False).input_ids
+        colon_tokens = tokeniser(": ", add_special_tokens=False).input_ids
+        label_tokens = tokeniser(": " + label, add_special_tokens=False).input_ids

         if isinstance(colon_tokens, torch.Tensor):
             colon_tokens = list(colon_tokens.squeeze(0))
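
The heuristic described in the docstring can be reproduced directly with a tokeniser; a sketch, assuming transformers is installed and using "gpt2" purely as an example model:

from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("gpt2")

# Tokenise ": " on its own and as a prefix of a label, as in the docstring.
colon_tokens = tokeniser(": ", add_special_tokens=False).input_ids
label_tokens = tokeniser(": positive", add_special_tokens=False).input_ids

# If the label encoding starts with the ": " tokens, the whitespace is emitted
# as a separate token and the prompts should not be stripped.
starts_with_colon = label_tokens[: len(colon_tokens)] == colon_tokens
print(f"strip prompts: {not starts_with_colon}")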
@@ -112,38 +114,38 @@ def should_prompts_be_stripped(


 def should_prefix_space_be_added_to_labels(
-    labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
+    labels_to_be_generated: list[str], tokeniser: "PreTrainedTokenizer"
 ) -> bool:
     """Determine if we should add a prefix space to the labels.

-    This is the case if the prompts are stripped and the tokenizer doesn't
+    This is the case if the prompts are stripped and the tokeniser doesn't
     automatically add prefix whitespaces to the labels.

     Args:
         labels_to_be_generated:
             The labels that are to be generated.
-        tokenizer:
-            The tokenizer used to tokenize the labels.
+        tokeniser:
+            The tokeniser used to tokenize the labels.

     Returns:
         Whether we should add a prefix space to the labels.
     """
     if not should_prompts_be_stripped(
-        labels_to_be_generated=labels_to_be_generated, tokenizer=tokenizer
+        labels_to_be_generated=labels_to_be_generated, tokeniser=tokeniser
     ):
         return False

-    whitespace_token = tokenizer.convert_ids_to_tokens(
-        ids=tokenizer(" ", add_special_tokens=False).input_ids[0]
+    whitespace_token = tokeniser.convert_ids_to_tokens(
+        ids=tokeniser(" ", add_special_tokens=False).input_ids[0]
     )[0]

     add_prefix_space = True
     for label in labels_to_be_generated:
-        label_tokens = tokenizer(label, add_special_tokens=False).input_ids
+        label_tokens = tokeniser(label, add_special_tokens=False).input_ids
         if isinstance(label_tokens, torch.Tensor):
             label_tokens = list(label_tokens.squeeze(0))
         first_label_token: int = int(label_tokens[0])
-        first_character_of_label = tokenizer.convert_ids_to_tokens(first_label_token)[0]
+        first_character_of_label = tokeniser.convert_ids_to_tokens(first_label_token)[0]
         has_prefix_space = first_character_of_label == whitespace_token
         if has_prefix_space:
             add_prefix_space = False
@@ -153,22 +155,22 @@ def should_prefix_space_be_added_to_labels(


 def get_bos_token(
-    tokenizer: "PreTrainedTokenizer",
+    tokeniser: "PreTrainedTokenizer",
 ) -> tuple[str, int] | tuple[None, None]:
-    """Get the beginning-of-sequence token from a tokenizer.
+    """Get the beginning-of-sequence token from a tokeniser.

     Args:
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.

     Returns:
         A pair (token, token_id) representing the beginning-of-sequence token and its
         token ID, or (None, None) if no BOS token is found.
     """
-    if isinstance(tokenizer.bos_token, str) and isinstance(tokenizer.bos_token_id, int):
-        return tokenizer.bos_token, tokenizer.bos_token_id
+    if isinstance(tokeniser.bos_token, str) and isinstance(tokeniser.bos_token_id, int):
+        return tokeniser.bos_token, tokeniser.bos_token_id

-    vocab: dict[str, int] = tokenizer.get_vocab()
+    vocab: dict[str, int] = tokeniser.get_vocab()

     candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "<|startoftext|>", "[CLS]"]
     for candidate_bos_token in candidate_bos_tokens:
@@ -179,7 +181,7 @@ def get_bos_token(
     else:
         log_once(
             "The model does not have a beginning-of-sequence token. Please ensure that "
-            "this has been set in the tokenizer's configuration. Using no BOS token."
+            "this has been set in the tokeniser's configuration. Using no BOS token."
             " This may lead to unexpected behavior in the model.",
             level=logging.INFO,
         )
@@ -194,22 +196,22 @@ def get_bos_token(


 def get_eos_token(
-    tokenizer: "PreTrainedTokenizer",
+    tokeniser: "PreTrainedTokenizer",
 ) -> tuple[str, int] | tuple[None, None]:
-    """Get the end-of-sequence token from a tokenizer.
+    """Get the end-of-sequence token from a tokeniser.

     Args:
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.

     Returns:
         A pair (token, token_id) representing the end-of-sequence token and its token
         ID, or (None, None) if no EOS token is found.
     """
-    if isinstance(tokenizer.eos_token, str) and isinstance(tokenizer.eos_token_id, int):
-        return tokenizer.eos_token, tokenizer.eos_token_id
+    if isinstance(tokeniser.eos_token, str) and isinstance(tokeniser.eos_token_id, int):
+        return tokeniser.eos_token, tokeniser.eos_token_id

-    vocab: dict[str, int] = tokenizer.get_vocab()
+    vocab: dict[str, int] = tokeniser.get_vocab()

     candidate_eos_tokens = ["</s>", "<|end_of_text|>", "<|endoftext|>", "[SEP]"]
     for candidate_eos_token in candidate_eos_tokens:
@@ -220,7 +222,7 @@ def get_eos_token(
     else:
         log_once(
             "The model does not have an end-of-sequence token. Please ensure that this "
-            "has been set in the tokenizer's configuration. Using no EOS token. This "
+            "has been set in the tokeniser's configuration. Using no EOS token. This "
            "may lead to unexpected behavior in the model.",
             level=logging.INFO,
         )
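
A hedged sketch of the BOS/EOS fallback behaviour, assuming EuroEval 16.0.0 is installed; when a tokeniser lacks an explicit token, the helpers scan the vocabulary for common candidates such as "<s>" or "</s>":

from transformers import AutoTokenizer

from euroeval.tokenization_utils import get_bos_token, get_eos_token

# GPT-2 is used purely as an example; it defines "<|endoftext|>" at both ends.
tokeniser = AutoTokenizer.from_pretrained("gpt2")
bos_token, bos_token_id = get_bos_token(tokeniser)
eos_token, eos_token_id = get_eos_token(tokeniser)
print(bos_token, bos_token_id)
print(eos_token, eos_token_id)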
@@ -235,55 +237,55 @@ def get_eos_token(


 def get_pad_token(
-    tokenizer: "PreTrainedTokenizer",
+    tokeniser: "PreTrainedTokenizer",
 ) -> tuple[str, int] | tuple[None, None]:
-    """Get the padding token from a tokenizer.
+    """Get the padding token from a tokeniser.

     Args:
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.

     Returns:
         A pair (token, token_id) representing the padding token and its token ID, or
         (None, None) if no padding token is found.
     """
-    # If the tokenizer already has a padding token, return it
-    if tokenizer.pad_token is not None and tokenizer.pad_token_id is not None:
-        assert isinstance(tokenizer.pad_token, str), (
-            "Expected tokenizer.pad_token to be a string, but got "
-            f"{type(tokenizer.pad_token)}."
+    # If the tokeniser already has a padding token, return it
+    if tokeniser.pad_token is not None and tokeniser.pad_token_id is not None:
+        assert isinstance(tokeniser.pad_token, str), (
+            "Expected tokeniser.pad_token to be a string, but got "
+            f"{type(tokeniser.pad_token)}."
         )
-        assert isinstance(tokenizer.pad_token_id, int), (
-            "Expected tokenizer.pad_token_id to be an integer, but got "
-            f"{type(tokenizer.pad_token_id)}."
+        assert isinstance(tokeniser.pad_token_id, int), (
+            "Expected tokeniser.pad_token_id to be an integer, but got "
+            f"{type(tokeniser.pad_token_id)}."
         )
-        return (tokenizer.pad_token, tokenizer.pad_token_id)
+        return (tokeniser.pad_token, tokeniser.pad_token_id)

-    # If the tokenizer has a BOS token, use it as the padding token
-    if tokenizer.bos_token is not None and tokenizer.bos_token_id is not None:
-        assert isinstance(tokenizer.bos_token, str), (
-            "Expected tokenizer.bos_token to be a string, but got "
-            f"{type(tokenizer.bos_token)}."
+    # If the tokeniser has a BOS token, use it as the padding token
+    if tokeniser.bos_token is not None and tokeniser.bos_token_id is not None:
+        assert isinstance(tokeniser.bos_token, str), (
+            "Expected tokeniser.bos_token to be a string, but got "
+            f"{type(tokeniser.bos_token)}."
         )
-        assert isinstance(tokenizer.bos_token_id, int), (
-            "Expected tokenizer.bos_token_id to be an integer, but got "
-            f"{type(tokenizer.bos_token_id)}."
+        assert isinstance(tokeniser.bos_token_id, int), (
+            "Expected tokeniser.bos_token_id to be an integer, but got "
+            f"{type(tokeniser.bos_token_id)}."
         )
-        pad_token = tokenizer.bos_token
-        pad_token_id = tokenizer.bos_token_id
-
-    # If the tokenizer has an EOS token, use it as the padding token
-    elif tokenizer.eos_token is not None and tokenizer.eos_token_id is not None:
-        assert isinstance(tokenizer.eos_token, str), (
-            "Expected tokenizer.eos_token to be a string, but got "
-            f"{type(tokenizer.eos_token)}."
+        pad_token = tokeniser.bos_token
+        pad_token_id = tokeniser.bos_token_id
+
+    # If the tokeniser has an EOS token, use it as the padding token
+    elif tokeniser.eos_token is not None and tokeniser.eos_token_id is not None:
+        assert isinstance(tokeniser.eos_token, str), (
+            "Expected tokeniser.eos_token to be a string, but got "
+            f"{type(tokeniser.eos_token)}."
         )
-        assert isinstance(tokenizer.eos_token_id, int), (
-            "Expected tokenizer.eos_token_id to be an integer, but got "
-            f"{type(tokenizer.eos_token_id)}."
+        assert isinstance(tokeniser.eos_token_id, int), (
+            "Expected tokeniser.eos_token_id to be an integer, but got "
+            f"{type(tokeniser.eos_token_id)}."
         )
-        pad_token = tokenizer.eos_token
-        pad_token_id = tokenizer.eos_token_id
+        pad_token = tokeniser.eos_token
+        pad_token_id = tokeniser.eos_token_id

     # Otherwise, try to find a candidate padding token in the vocabulary
     else:
@@ -296,14 +298,14 @@ def get_pad_token(
         ]
         pad_token_candidates.extend([c.upper() for c in pad_token_candidates])
         for candidate in pad_token_candidates:
-            if candidate in tokenizer.get_vocab():
+            if candidate in tokeniser.get_vocab():
                 pad_token = candidate
-                pad_token_id = tokenizer.get_vocab()[candidate]
+                pad_token_id = tokeniser.get_vocab()[candidate]
                 break
         else:
             log_once(
                 "Could not identify a padding token for the model. Please ensure that "
-                "this has been set in the tokenizer's configuration. Using no padding "
+                "this has been set in the tokeniser's configuration. Using no padding "
                 "token. This may lead to unexpected behavior in the model.",
                 level=logging.INFO,
             )
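
The padding fallback chain (explicit pad token, then BOS, then EOS, then a vocabulary scan) can be exercised like this; a sketch assuming EuroEval 16.0.0 is installed:

from transformers import AutoTokenizer

from euroeval.tokenization_utils import get_pad_token

# GPT-2 ships without a pad token, so the helper falls back to another token
# (its BOS/EOS token, "<|endoftext|>").
tokeniser = AutoTokenizer.from_pretrained("gpt2")
pad_token, pad_token_id = get_pad_token(tokeniser)
print(pad_token, pad_token_id)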
@@ -317,50 +319,58 @@ def get_pad_token(
     return pad_token, pad_token_id


-def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | None:
+def get_end_of_chat_token_ids(tokeniser: "PreTrainedTokenizer") -> list[int] | None:
     """Get the end token ID for chat models.

-    This is only relevant for tokenizers with a chat template.
+    This is only relevant for tokenisers with a chat template.

     Args:
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.

     Returns:
-        The token IDs used to end chats, or None if the tokenizer does not have a chat
-        template.
-
-    Raises:
-        ValueError:
-            If the end-of-chat token could not be located.
+        The token IDs used to end chats, or None if the tokeniser does not have a chat
+        template or if no end-of-chat token could be found.
     """
-    if tokenizer.chat_template is None:
+    if not has_chat_template(tokeniser=tokeniser):
         return None

     user_message: dict[str, str] = dict(role="user", content="X")
-    token_ids: list[int] = tokenizer.apply_chat_template(conversation=[user_message])  # type: ignore[assignment]
+    token_ids = apply_chat_template(
+        conversation=[user_message],
+        tokeniser=tokeniser,
+        tokenize=True,
+        add_generation_prompt=False,
+    )
+    assert isinstance(token_ids, list)

-    for idx, token in enumerate(tokenizer.convert_ids_to_tokens(token_ids)):
-        token_id = tokenizer.convert_tokens_to_ids(token)
-        assert isinstance(token_id, int)
-        token = tokenizer.decode([token_id])
+    for idx, token in enumerate(tokeniser.convert_ids_to_tokens(token_ids)):
         if "X" in token:
             x_token_index = idx
             break
     else:
-        raise ValueError("Could not locate the end-of-chat token for the model.")
+        logger.debug("Could not locate the end-of-chat token for the model.")
+        return None

     end_of_chat_tokens = token_ids[x_token_index + 1 :]
     if len(end_of_chat_tokens) == 0:
+        logger.debug("Could not locate the end-of-chat token for the model.")
         return None
+
+    log_once(
+        f"Detected end-of-chat token IDs as {end_of_chat_tokens}, corresponding to "
+        f"tokens {tokeniser.convert_ids_to_tokens(end_of_chat_tokens)}.",
+        level=logging.DEBUG,
+    )
     return end_of_chat_tokens


 def get_first_label_token_mapping(
     dataset_config: "DatasetConfig",
     model_config: "ModelConfig",
-    tokenizer: "PreTrainedTokenizer | None",
+    tokeniser: "PreTrainedTokenizer | None",
     generative_type: "GenerativeType | None",
+    log_metadata: bool,
 ) -> dict[str, str] | bool:
     """Check if the model should output scores.

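The end-of-chat detection above can be replicated outside EuroEval for inspection; a sketch assuming transformers is installed, with the model ID chosen purely as an example of a tokeniser that has a chat template:

from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

# Render a single-character user message and keep the tokens after the "X".
token_ids = tokeniser.apply_chat_template(
    conversation=[dict(role="user", content="X")], add_generation_prompt=False
)
tokens = tokeniser.convert_ids_to_tokens(token_ids)
x_token_index = next(idx for idx, tok in enumerate(tokens) if "X" in tok)
print(token_ids[x_token_index + 1 :])  # the end-of-chat token IDs
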
@@ -369,130 +379,208 @@ def get_first_label_token_mapping(
             The dataset configuration.
         model_config:
             The model configuration.
-        tokenizer:
-            The tokenizer, or None if not available.
+        tokeniser:
+            The tokeniser, or None if not available.
         generative_type:
             The generative type, or None if not available.
+        log_metadata:
+            Whether to log metadata.

     Returns:
         A mapping from labels to the first token in each label, or alternatively a
         Boolean value indicating whether the model should output scores (if the mapping
         is outputted then the model will always output scores).
     """
-    if generative_type == GenerativeType.REASONING:
-        log_once(
-            f"The model {model_config.model_id!r} is a reasoning model and "
-            "thus does not support logprobs, so we do not enable it.",
-            level=logging.DEBUG,
-        )
+    if not (dataset_config.task.uses_logprobs and dataset_config.labels):
+        if log_metadata:
+            log_once(
+                "We will not use logprobs with the model, since the dataset does not "
+                "have labels.",
+                level=logging.DEBUG,
+            )
         return False
-
-    # If we do not have any tokenizer, then we cannot check if the model should output
-    # scores and we just assume it should if the dataset supports it
-    output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
-    if tokenizer is None:
-        if output_scores:
+    elif generative_type == GenerativeType.REASONING:
+        if log_metadata:
             log_once(
-                f"We will use logprobs with the model {model_config.model_id!r} "
-                "since the dataset supports it and no tokenizer is available.",
+                f"The model {model_config.model_id!r} is a reasoning model and "
+                "thus does not support logprobs, so we do not enable it.",
                 level=logging.DEBUG,
             )
-        else:
+        return False
+    elif tokeniser is None:
+        if log_metadata:
             log_once(
-                f"We will not use logprobs with the model {model_config.model_id!r} "
-                "since the dataset does not support it and no tokenizer is available.",
+                f"We will use logprobs with the model {model_config.model_id!r} "
+                "since the dataset supports it and no tokeniser is available.",
                 level=logging.DEBUG,
             )
-        return output_scores
-
-    # If there are labels associated with the dataset, and that the first token of each
-    # label is distinct, then we can safely use the logprobs
-    if output_scores and dataset_config.labels:
-        local_labels = [
-            dataset_config.prompt_label_mapping[label].strip()
-            for label in dataset_config.labels
+        return True
+
+    local_labels = [
+        dataset_config.prompt_label_mapping[label].strip()
+        for label in dataset_config.labels
+    ]
+
+    # Tokenize some text containing each label, which we will use to extract the
+    # first token of each label
+    all_tokens: list[list[str]]
+    if not has_chat_template(tokeniser=tokeniser):
+        add_prefix_space = should_prefix_space_be_added_to_labels(
+            labels_to_be_generated=local_labels, tokeniser=tokeniser
+        )
+        all_tokens = [
+            tokeniser.tokenize(text=f" {label}" if add_prefix_space else label)
+            for label in local_labels
         ]
-
-        # Tokenize some text containing each label, which we will use to extract the
-        # first token of each label
-        all_tokens: list[list[str]]
-        if tokenizer.chat_template is None:
-            add_prefix_space = should_prefix_space_be_added_to_labels(
-                labels_to_be_generated=local_labels, tokenizer=tokenizer
-            )
-            all_tokens = [
-                tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)
-                for label in local_labels
-            ]
-        else:
-            all_tokens = [
-                tokenizer.convert_ids_to_tokens(
-                    ids=tokenizer.apply_chat_template(
-                        conversation=[
-                            dict(role="user", content=""),
-                            dict(role="assistant", content=label),
-                        ],
-                        add_generation_prompt=True,
-                        tokenize=True,
-                    )
-                )
-                for label in local_labels
-            ]
-
-        # Remove any non-alphabetic characters from the tokens
+    else:
         all_tokens = [
-            [
-                re.sub(
-                    pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
-                    repl="",
-                    string=token.lower(),
+            tokeniser.convert_ids_to_tokens(
+                ids=apply_chat_template(
+                    conversation=[
+                        dict(role="user", content=""),
+                        dict(role="assistant", content=label),
+                        # Adding extra user message as Mistral tokenisers require
+                        # conversations to end with a user message
+                        dict(role="user", content=""),
+                    ],
+                    tokeniser=tokeniser,
+                    tokenize=True,
                 )
-                for token in token_list
-            ]
-            for token_list in all_tokens
+            )
+            for label in local_labels
         ]

-        # Extract the first token of each label
-        first_tokens: list[str] = list()
-        for token_list, label in zip(all_tokens, local_labels):
-            matching_tokens = [
-                tok for tok in token_list if tok and label.startswith(tok)
-            ]
-            if not matching_tokens:
+    # Remove any non-alphabetic characters from the tokens
+    all_tokens = [
+        [
+            re.sub(
+                pattern=r"^[^a-zæøåüöä0-9]+|[^a-zæøåüöä0-9]+$",
+                repl="",
+                string=token.lower(),
+            )
+            for token in token_list
+        ]
+        for token_list in all_tokens
+    ]
+
+    # Extract the first token of each label
+    first_tokens: list[str] = list()
+    for token_list, label in zip(all_tokens, local_labels):
+        matching_tokens = [tok for tok in token_list if tok and label.startswith(tok)]
+        if not matching_tokens:
+            if log_metadata:
                 log_once(
                     f"No matching token found in token_list for label '{label}', so "
                     "we will not use logprobs with the model.",
                     level=logging.DEBUG,
                 )
-                return False
-            first_tokens.append(matching_tokens[0])
-
-        # Build a mapping from labels to the first token in each label if the first
-        # tokens are distinct
-        if len(first_tokens) == len(set(first_tokens)):
+            return False
+        first_tokens.append(matching_tokens[0])
+
+    # Build a mapping from labels to the first token in each label if the first
+    # tokens are distinct
+    if len(first_tokens) == len(set(first_tokens)):
+        mapping = {
+            label: first_token for label, first_token in zip(local_labels, first_tokens)
+        }
+        if log_metadata:
             log_once(
-                "We will use logprobs with the model since the first tokens of the "
-                "labels are distinct.",
+                "Using logprobs as evaluation strategy for the model, with the "
+                f"following mapping from labels to their first token: {mapping}.",
                 level=logging.DEBUG,
             )
-            return {
-                label: first_token
-                for label, first_token in zip(local_labels, first_tokens)
-            }
-        else:
+        return mapping
+    else:
+        if log_metadata:
             log_once(
                 "We will not use logprobs with the model since the first tokens of the "
                 "labels are not distinct. The first tokens for the labels "
                 f"{local_labels} are {first_tokens}"
             )
-            return False
+        return False

-    # Otherwise, we assume that the model should not output scores, to avoid potential
-    # evaluation errors. This will force the label extraction to rely on word edit
-    # distance instead of logprobs.
-    log_once(
-        "We will not use logprobs with the model, since the dataset does not have "
-        "labels.",
-        level=logging.DEBUG,
-    )
-    return False
+
+def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
+    """Check if a tokeniser has a chat template.
+
+    Args:
+        tokeniser:
+            The tokeniser.
+
+    Returns:
+        Whether the tokeniser has a chat template.
+    """
+    if hasattr(tokeniser, "chat_template"):
+        has_template = tokeniser.chat_template is not None
+        if has_template:
+            log_once(
+                "The tokeniser has a chat template, so assuming that the model is "
+                "instruction tuned.",
+                level=logging.DEBUG,
+            )
+        return has_template
+    elif isinstance(tokeniser, MistralCommonTokenizer):
+        log_once(
+            "The tokeniser is a Mistral tokeniser, so assuming that the model is "
+            "instruction tuned.",
+            level=logging.DEBUG,
+        )
+        return True
+    else:
+        log_once(
+            "We cannot find a chat template for the tokeniser, so assuming that the "
+            "model isn't instruction tuned.",
+            level=logging.DEBUG,
+        )
+        return False
+
+
+def apply_chat_template(
+    conversation: list[dict[str, str]],
+    tokeniser: "PreTrainedTokenizer",
+    tokenize: bool = False,
+    add_generation_prompt: bool = True,
+    **transformers_tokeniser_kwargs,
+) -> str | list[int]:
+    """Apply the chat template to a prompt.
+
+    Args:
+        conversation:
+            The conversation to apply the chat template to.
+        tokeniser:
+            The tokeniser.
+        tokenize:
+            Whether to tokenize the resulting prompt, returning a list of token IDs
+            instead of a string.
+        add_generation_prompt:
+            Whether to add a generation prompt at the end of the conversation. This is
+            only relevant for regular Hugging Face tokenisers, as Mistral tokenisers
+            always add a generation prompt.
+        **transformers_tokeniser_kwargs:
+            Additional keyword arguments to pass to the tokeniser, in case the
+            tokeniser is a regular Hugging Face tokeniser.
+
+    Returns:
+        The prompt with the chat template applied, either as a string or a list of
+        token IDs, depending on the value of `tokenize`.
+
+    Raises:
+        InvalidModel:
+            If the tokeniser does not have a chat template.
+    """
+    if not has_chat_template(tokeniser=tokeniser):
+        raise InvalidModel(
+            "The tokeniser does not have a chat template, so cannot apply it."
+        )
+    elif isinstance(tokeniser, MistralCommonTokenizer):
+        templated_prompt = tokeniser.apply_chat_template(
+            conversation=conversation, tokenize=tokenize
+        )
+    else:
+        templated_prompt = tokeniser.apply_chat_template(
+            conversation=conversation,
+            add_generation_prompt=add_generation_prompt,
+            tokenize=tokenize,
+            **transformers_tokeniser_kwargs,
+        )
+    return templated_prompt
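
A usage sketch of the two new helpers, assuming EuroEval 16.0.0 is installed; the model ID is an example only. The wrapper dispatches on the tokeniser type, so callers no longer branch on chat_template themselves:

from transformers import AutoTokenizer

from euroeval.tokenization_utils import apply_chat_template, has_chat_template

tokeniser = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

# Guard with has_chat_template, since apply_chat_template raises InvalidModel
# for tokenisers without a chat template.
if has_chat_template(tokeniser=tokeniser):
    prompt = apply_chat_template(
        conversation=[dict(role="user", content="Hello!")],
        tokeniser=tokeniser,
        tokenize=False,
        add_generation_prompt=True,
    )
    print(prompt)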