EuroEval 16.0.0-py3-none-any.whl → 16.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of EuroEval might be problematic.

Files changed (51)
  1. euroeval/__init__.py +5 -0
  2. euroeval/benchmark_config_factory.py +6 -1
  3. euroeval/benchmark_modules/base.py +2 -0
  4. euroeval/benchmark_modules/fresh.py +7 -1
  5. euroeval/benchmark_modules/hf.py +26 -21
  6. euroeval/benchmark_modules/litellm.py +258 -131
  7. euroeval/benchmark_modules/vllm.py +120 -68
  8. euroeval/benchmarker.py +11 -2
  9. euroeval/cli.py +14 -1
  10. euroeval/constants.py +7 -1
  11. euroeval/data_models.py +95 -20
  12. euroeval/dataset_configs/__init__.py +1 -0
  13. euroeval/dataset_configs/danish.py +14 -3
  14. euroeval/dataset_configs/dutch.py +14 -0
  15. euroeval/dataset_configs/english.py +22 -0
  16. euroeval/dataset_configs/estonian.py +15 -7
  17. euroeval/dataset_configs/finnish.py +14 -0
  18. euroeval/dataset_configs/french.py +14 -0
  19. euroeval/dataset_configs/german.py +23 -0
  20. euroeval/dataset_configs/italian.py +14 -0
  21. euroeval/dataset_configs/latvian.py +14 -0
  22. euroeval/dataset_configs/norwegian.py +14 -0
  23. euroeval/dataset_configs/polish.py +126 -0
  24. euroeval/dataset_configs/portuguese.py +14 -0
  25. euroeval/dataset_configs/spanish.py +14 -0
  26. euroeval/dataset_configs/swedish.py +25 -0
  27. euroeval/enums.py +12 -0
  28. euroeval/generation.py +17 -8
  29. euroeval/generation_utils.py +102 -16
  30. euroeval/metrics/pipeline.py +51 -9
  31. euroeval/model_cache.py +13 -1
  32. euroeval/prompt_templates/linguistic_acceptability.py +9 -0
  33. euroeval/prompt_templates/multiple_choice.py +27 -1
  34. euroeval/prompt_templates/named_entity_recognition.py +20 -0
  35. euroeval/prompt_templates/reading_comprehension.py +11 -0
  36. euroeval/prompt_templates/sentiment_classification.py +15 -0
  37. euroeval/prompt_templates/summarization.py +27 -1
  38. euroeval/scores.py +5 -0
  39. euroeval/task_group_utils/multiple_choice_classification.py +2 -2
  40. euroeval/task_group_utils/question_answering.py +29 -29
  41. euroeval/task_group_utils/sequence_classification.py +71 -81
  42. euroeval/task_group_utils/token_classification.py +17 -3
  43. euroeval/tasks.py +12 -10
  44. euroeval/{tokenization_utils.py → tokenisation_utils.py} +41 -25
  45. euroeval/utils.py +67 -3
  46. {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/METADATA +3 -1
  47. euroeval-16.1.0.dist-info/RECORD +70 -0
  48. euroeval-16.0.0.dist-info/RECORD +0 -69
  49. {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/WHEEL +0 -0
  50. {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/entry_points.txt +0 -0
  51. {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/licenses/LICENSE +0 -0
euroeval/prompt_templates/multiple_choice.py CHANGED
@@ -3,7 +3,25 @@
  import typing as t

  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, ET, FI, FR, IS, IT, LV, NB, NL, NN, NO, PT, SV
+ from ..languages import (
+ DA,
+ DE,
+ EN,
+ ES,
+ ET,
+ FI,
+ FR,
+ IS,
+ IT,
+ LV,
+ NB,
+ NL,
+ NN,
+ NO,
+ PL,
+ PT,
+ SV,
+ )

  if t.TYPE_CHECKING:
  from ..data_models import Language
@@ -123,6 +141,14 @@ MULTIPLE_CHOICE_TEMPLATES: dict["Language", PromptConfig] = {
  "{labels_str}, og ikke noe annet.",
  default_prompt_label_mapping="auto",
  ),
+ PL: PromptConfig(
+ default_prompt_prefix="Poniżej znajdują się pytania wielokrotnego wyboru "
+ "(z odpowiedziami).",
+ default_prompt_template="Pytanie: {text}\nOdpowiedź: {label}",
+ default_instruction_prompt="Pytanie: {text}\n\nOdpowiedz na powyższe pytanie, "
+ "odpowiadając {labels_str}, i nic więcej.",
+ default_prompt_label_mapping="auto",
+ ),
  SV: PromptConfig(
  default_prompt_prefix="Följande är flervalsfrågor (med svar).",
  default_prompt_template="Fråga: {text}\nSvar: {label}",
euroeval/prompt_templates/named_entity_recognition.py CHANGED
@@ -19,6 +19,7 @@ from ..languages import (
  NL,
  NN,
  NO,
+ PL,
  PT,
  SV,
  )
@@ -336,6 +337,25 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
  "Verdiene skal være lister over de navngitte enhetene "
  "av den typen, akkurat som de vises i frasen.",
  ),
+ PL: PromptConfig(
+ default_prompt_label_mapping={
+ "b-per": "osoba",
+ "i-per": "osoba",
+ "b-loc": "lokalizacja",
+ "i-loc": "lokalizacja",
+ "b-org": "organizacja",
+ "i-org": "organizacja",
+ "b-misc": "różne",
+ "i-misc": "różne",
+ },
+ default_prompt_prefix="Poniżej znajdują się zdania i słowniki JSON z nazwanymi "
+ "jednostkami występującymi w danym zdaniu.",
+ default_prompt_template="Zdanie: {text}\nNazwane jednostki: {label}",
+ default_instruction_prompt="Zdanie: {text}\n\nZidentyfikuj nazwane jednostki "
+ "w zdaniu. Powinieneś wypisać to jako słownik JSON z kluczami "
+ "{labels_str}. Wartości powinny być listami nazwanych jednostek "
+ "tego typu, dokładnie tak jak pojawiają się w zdaniu.",
+ ),
  SV: PromptConfig(
  default_prompt_label_mapping={
  "b-per": "person",
euroeval/prompt_templates/reading_comprehension.py CHANGED
@@ -19,6 +19,7 @@ from ..languages import (
  NL,
  NN,
  NO,
+ PL,
  PT,
  SV,
  )
@@ -157,6 +158,16 @@ RC_TEMPLATES: dict["Language", PromptConfig] = {
  "teksten ovenfor med maks 3 ord.\n\nSpørsmål: {question}",
  default_prompt_label_mapping=dict(),
  ),
+ PL: PromptConfig(
+ default_prompt_prefix=(
+ "Poniżej znajdują się teksty z towarzyszącymi pytaniami i odpowiedziami."
+ ),
+ default_prompt_template="Tekst: {text}\nPytanie: {question}\nOdpowiedź w "
+ "maksymalnie 3 słowach: {label}",
+ default_instruction_prompt="Tekst: {text}\n\nOdpowiedz na następujące pytanie "
+ "dotyczące powyższego tekstu w maksymalnie 3 słowach.\n\nPytanie: {question}",
+ default_prompt_label_mapping=dict(),
+ ),
  PT: PromptConfig(
  default_prompt_prefix="Os textos que se seguem são acompanhados de perguntas "
  "e respostas.",
euroeval/prompt_templates/sentiment_classification.py CHANGED
@@ -19,6 +19,7 @@ from ..languages import (
  NL,
  NN,
  NO,
+ PL,
  PT,
  SV,
  )
@@ -78,6 +79,20 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
  "meelestatuse järgi. Võimalikud vastused: {labels_str}. Muud vastused "
  "ei ole lubatud.",
  ),
+ PL: PromptConfig(
+ default_prompt_label_mapping=dict(
+ positive="pozytywny", neutral="neutralny", negative="negatywny"
+ ),
+ default_prompt_prefix=(
+ "Poniżej znajdują się dokumenty i ich sentyment, który może być "
+ "{labels_str}."
+ ),
+ default_prompt_template="Dokument: {text}\nSentyment: {label}",
+ default_instruction_prompt=(
+ "Dokument: {text}\n\nKlasyfikuj sentyment w dokumencie. "
+ "Odpowiedz z {labels_str}, i nic więcej."
+ ),
+ ),
  PT: PromptConfig(
  default_prompt_label_mapping=dict(
  positive="positivo", neutral="neutro", negative="negativo"
euroeval/prompt_templates/summarization.py CHANGED
@@ -3,7 +3,25 @@
  import typing as t

  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, ET, FI, FR, IS, IT, LV, NB, NL, NN, NO, PT, SV
+ from ..languages import (
+ DA,
+ DE,
+ EN,
+ ES,
+ ET,
+ FI,
+ FR,
+ IS,
+ IT,
+ LV,
+ NB,
+ NL,
+ NN,
+ NO,
+ PL,
+ PT,
+ SV,
+ )

  if t.TYPE_CHECKING:
  from ..data_models import Language
@@ -122,6 +140,14 @@ SUMM_TEMPLATES: dict["Language", PromptConfig] = {
  "dokumentet ovenfor.",
  default_prompt_label_mapping=dict(),
  ),
+ PL: PromptConfig(
+ default_prompt_prefix="Poniżej znajdują się artykuły z towarzyszącymi "
+ "streszczeniami.",
+ default_prompt_template="Artykuł: {text}\nStreszczenie: {target_text}",
+ default_instruction_prompt="Artykuł: {text}\n\nNapisz streszczenie "
+ "powyższego artykułu.",
+ default_prompt_label_mapping=dict(),
+ ),
  SV: PromptConfig(
  default_prompt_prefix="Nedan följer dokument med tillhörande sammanfattningar.",
  default_prompt_template="Dokument: {text}\nSammanfattning: {target_text}",
euroeval/scores.py CHANGED
@@ -19,6 +19,7 @@ def log_scores(
  scores: list[dict[str, float]],
  model_id: str,
  model_revision: str,
+ model_param: str | None,
  ) -> "ScoreDict":
  """Log the scores.

@@ -34,6 +35,8 @@
  The model ID of the model that was evaluated.
  model_revision:
  The revision of the model.
+ model_param:
+ The model parameter, if any.

  Returns:
  A dictionary with keys 'raw_scores' and 'total', with 'raw_scores' being
@@ -42,6 +45,8 @@
  """
  if model_revision and model_revision != "main":
  model_id += f"@{model_revision}"
+ if model_param is not None:
+ model_id += f"#{model_param}"

  logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")
euroeval/task_group_utils/multiple_choice_classification.py CHANGED
@@ -126,7 +126,7 @@
  ):
  choice_idxs.append(idx)

- choices = [sections[idx] for idx in choice_idxs]
+ choices = [sections[idx] for idx in reversed(choice_idxs)]

  # Check that the choices are present, and that all of them are at the end
  assert len(choices) > 0, "No choices found in the document."
@@ -146,7 +146,7 @@
  )
  new_examples["label"] = [
  int(choice.startswith(f"{letter}. ") and letter == examples["label"][0])
- for letter, choice in zip("abcde", choices)
+ for letter, choice in zip("abcdefghijklmnopqrstuvwxyz", choices)
  ]
  new_examples["id"] = [hashlib.md5(string=doc.encode()).hexdigest()] * len(choices)
  return new_examples
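A small sketch of what widening the letter range changes: zip only consumes as many letters as there are choices, so documents with more than five options now get a full label vector instead of being silently truncated (the example choices and gold label below are invented):

# Sketch: the label vector marks which lettered choice matches the gold label.
# With the old "abcde" string, any choice after the fifth was dropped by zip.
choices = ["a. Red", "b. Green", "c. Blue", "d. Cyan", "e. Magenta", "f. Yellow"]
gold_label = "f"

labels = [
    int(choice.startswith(f"{letter}. ") and letter == gold_label)
    for letter, choice in zip("abcdefghijklmnopqrstuvwxyz", choices)
]
print(labels)  # [0, 0, 0, 0, 0, 1]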
euroeval/task_group_utils/question_answering.py CHANGED
@@ -10,7 +10,7 @@ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
  from transformers.trainer import Trainer

  from ..exceptions import InvalidBenchmark
- from ..tokenization_utils import get_special_token_metadata
+ from ..tokenisation_utils import get_special_token_metadata
  from ..utils import raise_if_model_output_contains_nan_values

  if t.TYPE_CHECKING:
@@ -261,7 +261,7 @@ def prepare_train_examples(
  ]
  examples["context"] = [f"{c}{sep_token}" for c in examples["context"]]

- # Set the stride used during tokenization, when the context is long enough to be
+ # Set the stride used during tokenisation, when the context is long enough to be
  # split into several features. Since we are always keeping the question tokens, we
  # need to make sure that the stride does not exceed the resulting maximum context
  # length.
@@ -272,11 +272,11 @@
  stride = min(stride, max_length - max_question_tokens - num_special_tokens)
  max_length = tokeniser.model_max_length - stride

- # Tokenize our examples with truncation and padding, but keep the overflows using a
+ # Tokenise our examples with truncation and padding, but keep the overflows using a
  # stride. This results in one example possible giving several features when a
  # context is long, each of those features having a context that overlaps a bit the
  # context of the previous feature.
- tokenized_examples = tokeniser(
+ tokenised_examples = tokeniser(
  text=examples["question"],
  text_pair=examples["context"],
  truncation="only_second",
@@ -290,27 +290,27 @@
  # Since one example might give us several features if it has a long context, we
  # need a map from a feature to its corresponding example. This key gives us just
  # that
- sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+ sample_mapping = tokenised_examples.pop("overflow_to_sample_mapping")

  # The offset mappings will give us a map from token to character position in the
  # original context. This will help us compute the start_positions and
  # end_positions.
- offset_mapping = tokenized_examples.pop("offset_mapping")
+ offset_mapping = tokenised_examples.pop("offset_mapping")

  # Initialise the start- and end positions of the answers
- tokenized_examples["start_positions"] = list()
- tokenized_examples["end_positions"] = list()
+ tokenised_examples["start_positions"] = list()
+ tokenised_examples["end_positions"] = list()

  for i, offsets in enumerate(offset_mapping):
  # Get the input IDs for the current example
- input_ids = tokenized_examples.input_ids[i]
+ input_ids = tokenised_examples.input_ids[i]

  # We will label impossible answers with the index of the CLS token
  cls_index = input_ids.index(cls_token_id)

  # Grab the sequence corresponding to that example (to know what is the context
  # and what is the question).
- sequence_ids = tokenized_examples.sequence_ids(i)
+ sequence_ids = tokenised_examples.sequence_ids(i)

  # Manually ensure that the special tokens are set to None in `sequence_ids`
  for special_token in tokeniser.special_tokens_map.keys():
@@ -329,8 +329,8 @@

  # If no answers are given, set the cls_index as answer.
  if len(answers["answer_start"]) == 0:
- tokenized_examples.start_positions.append(cls_index)
- tokenized_examples.end_positions.append(cls_index)
+ tokenised_examples.start_positions.append(cls_index)
+ tokenised_examples.end_positions.append(cls_index)

  else:
  # Start/end character index of the answer in the text.
@@ -353,8 +353,8 @@
  offsets[token_start_index][0] <= start_char
  and offsets[token_end_index][1] >= end_char
  ):
- tokenized_examples.start_positions.append(cls_index)
- tokenized_examples.end_positions.append(cls_index)
+ tokenised_examples.start_positions.append(cls_index)
+ tokenised_examples.end_positions.append(cls_index)

  # Otherwise move the token_start_index and token_end_index to the two ends
  # of the answer. Note: we could go after the last offset if the answer is
@@ -366,17 +366,17 @@
  ):
  token_start_index += 1
  token_start_index -= 1
- tokenized_examples.start_positions.append(token_start_index)
+ tokenised_examples.start_positions.append(token_start_index)
  while (
  token_start_index <= token_end_index
  and offsets[token_end_index][1] >= end_char
  ):
  token_end_index -= 1
  token_end_index += 1
- tokenized_examples.end_positions.append(token_end_index)
+ tokenised_examples.end_positions.append(token_end_index)
  assert token_end_index >= token_start_index

- return tokenized_examples
+ return tokenised_examples


  def prepare_test_examples(
@@ -394,7 +394,7 @@ def prepare_test_examples(
  The prepared test examples.
  """
  # Some of the questions have lots of whitespace on the left, which is not useful
- # and will make the truncation of the context fail (the tokenized question will
+ # and will make the truncation of the context fail (the tokenised question will
  # take a lots of space). So we remove that left whitespace
  examples["question"] = [q.lstrip() for q in examples["question"]]

@@ -412,7 +412,7 @@
  ]
  examples["context"] = [f"{c}{sep_token}" for c in examples["context"]]

- # Set the stride used during tokenization, when the context is long enough to be
+ # Set the stride used during tokenisation, when the context is long enough to be
  # split into several features. Since we are always keeping the question tokens, we
  # need to make sure that the stride does not exceed the resulting maximum context
  # length.
@@ -423,11 +423,11 @@
  stride = min(stride, max_length - max_question_tokens - num_special_tokens)
  max_length = tokeniser.model_max_length - stride

- # Tokenize our examples with truncation and maybe padding, but keep the overflows
+ # Tokenise our examples with truncation and maybe padding, but keep the overflows
  # using a stride. This results in one example possible giving several features when
  # a context is long, each of those features having a context that overlaps a bit
  # the context of the previous feature.
- tokenized_examples = tokeniser(
+ tokenised_examples = tokeniser(
  text=examples["question"],
  text_pair=examples["context"],
  truncation="only_second",
@@ -441,30 +441,30 @@
  # Since one example might give us several features if it has a long context, we
  # need a map from a feature to its corresponding example. This key gives us just
  # that.
- sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+ sample_mapping = tokenised_examples.pop("overflow_to_sample_mapping")

  # We keep the id that gave us this feature and we will store the offset mappings.
- tokenized_examples["id"] = list()
+ tokenised_examples["id"] = list()

- for i in range(len(tokenized_examples.input_ids)):
+ for i in range(len(tokenised_examples.input_ids)):
  # Grab the sequence corresponding to that example (to know what is the context
  # and what is the question).
- sequence_ids = tokenized_examples.sequence_ids(i)
+ sequence_ids = tokenised_examples.sequence_ids(i)
  context_index = 1

  # One example can give several spans, this is the index of the example
  # containing this span of text.
  sample_index = sample_mapping[i]
- tokenized_examples.id.append(examples["id"][sample_index])
+ tokenised_examples.id.append(examples["id"][sample_index])

  # Set to (-1, -1) the offset_mapping that are not part of the context so it's
  # easy to determine if a token position is part of the context or not.
- tokenized_examples.offset_mapping[i] = [
+ tokenised_examples.offset_mapping[i] = [
  (o if sequence_ids[k] == context_index else (-1, -1))
- for k, o in enumerate(tokenized_examples.offset_mapping[i])
+ for k, o in enumerate(tokenised_examples.offset_mapping[i])
  ]

- return tokenized_examples
+ return tokenised_examples


  def postprocess_predictions_and_labels(
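Aside from the spelling change, the renamed tokenised_examples object is the BatchEncoding that Hugging Face tokenisers return when asked for overflowing windows; a minimal standalone sketch of that call pattern is shown below (the model name and input texts are placeholders, not taken from EuroEval):

# Standalone sketch of the overflow/stride tokenisation used above.
from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("bert-base-cased")
tokenised_examples = tokeniser(
    text=["What colour is the sky?"],
    text_pair=["A long context paragraph. " * 200],
    truncation="only_second",
    max_length=128,
    stride=32,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding="max_length",
)
# One input can yield several features; this maps each feature back to its example.
sample_mapping = tokenised_examples.pop("overflow_to_sample_mapping")
print(len(tokenised_examples["input_ids"]), list(sample_mapping))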
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -9,7 +9,11 @@ import numpy as np

  from ..enums import TaskGroup
  from ..exceptions import InvalidBenchmark
- from ..utils import log_once, raise_if_model_output_contains_nan_values
+ from ..utils import (
+ extract_multiple_choice_labels,
+ log_once,
+ raise_if_model_output_contains_nan_values,
+ )

  if t.TYPE_CHECKING:
  from datasets.arrow_dataset import Dataset
@@ -128,6 +132,21 @@ def extract_labels_from_generation(
  or if the model outputted log probabilities but the first label token
  mapping is not provided.
  """
+ # Get the candidate labels, which are the labels that the model can predict
+ default_labels = [
+ dataset_config.prompt_label_mapping[lbl]
+ for lbl in dataset_config.id2label.values()
+ ]
+ if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+ sample_candidate_labels = [
+ extract_multiple_choice_labels(
+ prompt=prompt, candidate_labels=default_labels
+ )
+ for prompt in input_batch["prompt"]
+ ]
+ else:
+ sample_candidate_labels = [default_labels] * len(input_batch["prompt"])
+
  if model_output.scores is not None:
  if first_label_token_mapping is False:
  raise InvalidBenchmark(
@@ -136,8 +155,8 @@
  )
  labels = get_closest_logprobs_labels(
  generation_logprobs=model_output.scores,
- dataset_config=dataset_config,
  first_label_token_mapping=first_label_token_mapping,
+ candidate_labels=sample_candidate_labels,
  )
  if labels is not None:
  return labels
@@ -147,31 +166,8 @@
  "does not seem to be able to do that. Skipping the evaluation."
  )

- # Get the candidate labels, which are the labels that the model can predict
- candidate_labels = [
- dataset_config.prompt_label_mapping[lbl]
- for lbl in dataset_config.id2label.values()
- ]
-
  new_predicted_labels: list[str] = list()
  for idx, predicted_label in enumerate(model_output.sequences):
- # Special case if we are doing multiple choice classification: we in this case
- # dynamically change the candidate labels to the labels mentioned in the prompt
- if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
- prompt = input_batch["text"][idx]
- sample_candidate_labels = [
- candidate_label
- for candidate_label in candidate_labels
- if re.search(
- pattern=rf"\b{candidate_label}. ",
- string=prompt,
- flags=re.IGNORECASE,
- )
- is not None
- ]
- else:
- sample_candidate_labels = candidate_labels
-
  # If the prediction includes a boxed answer, use that instead of the full
  # generation
  if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
@@ -192,22 +188,43 @@
  s2=candidate_label.lower(),
  weights=(insertion_weight, deletion_weight, substitution_weight),
  )
- for candidate_label in sample_candidate_labels
+ for candidate_label in sample_candidate_labels[idx]
  ]

- # If no candidate labels were found, we assume that something is wrong with the
- # model output, and we raise an error
+ best_candidate_label = sample_candidate_labels[idx][
+ np.argmin(edit_distances).item()
+ ]
+
+ # If no candidate labels were found, we either pick the label with the smallest
+ # word edit distance to the predicted label (if invalid model outputs are
+ # allowed), or we raise an error
  if min(edit_distances) > 100:
- raise InvalidBenchmark(
- "No candidate labels found for the predicted label "
- f"{predicted_label!r}, out of the candidate labels "
- f"{sample_candidate_labels}. This likely means that the model output "
- "is completely off, and we cannot extract any labels from it. Please "
- "check the model output and the candidate labels."
- )
+ if dataset_config.allow_invalid_model_outputs:
+ logger.warning(
+ "No candidate labels found for the predicted label "
+ f"{predicted_label!r}, out of the candidate labels "
+ f"{sample_candidate_labels[idx]}. This likely means that the model "
+ "output is completely off, but since invalid model outputs are "
+ "allowed for this task, we will use the closest candidate label "
+ f"({best_candidate_label})) as the output label. If you see this "
+ "warning very often, please report this issue to the EuroEval "
+ "team at github.com/EuroEval/EuroEval/issues."
+ )
+ logger.debug(
+ "The candidate labels were extracted from the prompt: "
+ f"{input_batch['text'][idx]!r}."
+ )
+ else:
+ raise InvalidBenchmark(
+ "No candidate labels found for the predicted label "
+ f"{predicted_label!r}, out of the candidate labels "
+ f"{sample_candidate_labels[idx]}. This likely means that the model "
+ "output is completely off, and we cannot extract any labels from "
+ "it. Please check the model output and the candidate labels. The "
+ "candidate labels were extracted from the prompt: "
+ f"{input_batch['text'][idx]!r}."
+ )

- # Pick the label with the smallest word edit distance to the predicted label
- best_candidate_label = sample_candidate_labels[np.argmin(edit_distances).item()]
  new_predicted_labels.append(best_candidate_label)

  return new_predicted_labels
@@ -215,8 +232,8 @@

  def get_closest_logprobs_labels(
  generation_logprobs: list[list[list[tuple[str, float]]]],
- dataset_config: "DatasetConfig",
  first_label_token_mapping: dict[str, str] | t.Literal[True],
+ candidate_labels: list[list[str]],
  ) -> list[str] | None:
  """Get the labels with the highest predicted logprob value.

@@ -229,11 +246,11 @@
  generation_logprobs:
  The logprobs of the generated tokens, for all samples in the batch. Of shape
  (batch_size, num_tokens, num_logprobs).
- dataset_config:
- The configuration of the dataset.
  first_label_token_mapping:
  A mapping from labels to the first token in each label, or alternatively a
  `True` value indicating that the model should output logprobs.
+ candidate_labels:
+ The candidate labels for each sample in the batch.

  Returns:
  The predicted labels, or None if labels could not be extracted.
@@ -242,12 +259,8 @@
  InvalidBenchmark:
  If no candidate label can be found for any of the generated labels.
  """
- english_labels = list(dataset_config.id2label.values())
- english2local = dataset_config.prompt_label_mapping
- candidate_labels = [english2local[lbl].lower() for lbl in english_labels]
-
  output_labels: list[str] = list()
- for sample in generation_logprobs:
+ for idx, sample in enumerate(generation_logprobs):
  for logprob_list in sample:
  generated_labels = [
  re.sub(pattern=r"^[^a-zæøåüöä0-9]+$", repl="", string=label.lower())
@@ -265,7 +278,7 @@
  if isinstance(first_label_token_mapping, dict):
  if any(
  candidate_label not in first_label_token_mapping
- for candidate_label in candidate_labels
+ for candidate_label in candidate_labels[idx]
  ):
  raise InvalidBenchmark(
  "There is a label not present in the first label token "
@@ -276,26 +289,14 @@

  candidate_output_labels = {
  candidate_label
- for candidate_label in candidate_labels
+ for candidate_label in candidate_labels[idx]
  if generated_label == first_label_token_mapping[candidate_label]
  }
  else:
  candidate_output_labels = {
  candidate_label
- for candidate_label in candidate_labels
- if candidate_label.startswith(generated_label)
- }
-
- # If the generated label is a numeral (e.g., "1", "2", "3") and there is
- # a matching candidate label, we only keep the full match
- if re.match(r"^\d+$", generated_label) and any(
- candidate_label == generated_label
- for candidate_label in candidate_output_labels
- ):
- candidate_output_labels = {
- candidate_label
- for candidate_label in candidate_output_labels
- if candidate_label == generated_label
+ for candidate_label in candidate_labels[idx]
+ if candidate_label.startswith(generated_label.strip())
  }

  # If we can uniquely determine the output label, we break the loop.
@@ -328,7 +329,7 @@
  elif len(candidate_output_labels) == 0:
  candidate_output_labels_starting_with_generated_label = [
  candidate_label
- for candidate_label in candidate_labels
+ for candidate_label in candidate_labels[idx]
  if candidate_label.startswith(generated_label)
  ]
  if candidate_output_labels_starting_with_generated_label:
@@ -344,19 +345,6 @@
  )
  return None

- # If we did not find any candidate label for any of the generated labels, we
- # assume that something is wrong with the model output, and we fall back to
- # using word edit distance to extract the labels
- else:
- log_once(
- f"No candidate label found for any of the generated labels "
- f"{generated_labels}. This means that using logprobs to extract "
- "the labels is not reliable, and we will instead fall back to "
- "extracting the labels using word edit distance.",
- level=logging.DEBUG,
- )
- return None
-
  if output_label is not None:
  output_labels.append(output_label)
  break
@@ -364,18 +352,20 @@
  if len(sample) == 0:
  log_once(
  "The model outputted an empty string, so no candidate labels could "
- f"be determined. Using the first label, {candidate_labels[0]!r}, "
- "as the output label.",
+ "be determined. This means that using logprobs to extract the "
+ "labels is not reliable, and we will instead fall back to "
+ "extracting the labels using word edit distance.",
  level=logging.INFO,
  )
  else:
  log_once(
- "Could not find a candidate label for any of the generated "
- f"labels in the sample {sample}. Using the first label, "
- f"{candidate_labels[0]!r}, as the output label.",
+ "No candidate label found for any of the generated labels, which "
+ "means that using logprobs to extract the labels is not reliable, "
+ "and we will instead fall back to extracting the labels using "
+ "word edit distance.",
  level=logging.INFO,
  )
- output_labels.append(candidate_labels[0])
+ return None

  assert len(output_labels) == len(generation_logprobs)
  return output_labels
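The new extract_multiple_choice_labels helper is imported from euroeval/utils.py, whose diff is not shown above. Based on the inline regex logic it replaces (the removed block in extract_labels_from_generation), a hypothetical sketch of its behaviour would be:

# Hypothetical sketch of extract_multiple_choice_labels, reconstructed from the
# inline regex logic it replaces above; the real implementation in
# euroeval/utils.py may differ.
import re

def extract_multiple_choice_labels(prompt: str, candidate_labels: list[str]) -> list[str]:
    """Keep only the candidate labels that actually appear as options in the prompt."""
    return [
        candidate_label
        for candidate_label in candidate_labels
        if re.search(rf"\b{candidate_label}. ", prompt, flags=re.IGNORECASE) is not None
    ]

print(extract_multiple_choice_labels(
    prompt="Pytanie: ...\na. pierwsza opcja\nb. druga opcja\nOdpowiedź:",
    candidate_labels=["a", "b", "c", "d"],
))
# ['a', 'b']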