EuroEval 15.16.0-py3-none-any.whl → 16.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (64)
  1. euroeval/__init__.py +8 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +199 -139
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +19 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +73 -23
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +35 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +90 -20
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +276 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/model_cache.py +13 -1
  41. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  42. euroeval/prompt_templates/multiple_choice.py +23 -2
  43. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  44. euroeval/prompt_templates/reading_comprehension.py +42 -2
  45. euroeval/prompt_templates/sentiment_classification.py +46 -2
  46. euroeval/prompt_templates/summarization.py +24 -4
  47. euroeval/scores.py +7 -2
  48. euroeval/speed_benchmark.py +6 -6
  49. euroeval/task_group_utils/multiple_choice_classification.py +19 -8
  50. euroeval/task_group_utils/question_answering.py +35 -28
  51. euroeval/task_group_utils/sequence_classification.py +128 -42
  52. euroeval/task_group_utils/text_to_text.py +7 -3
  53. euroeval/task_group_utils/token_classification.py +59 -73
  54. euroeval/tasks.py +33 -6
  55. euroeval/tokenization_utils.py +294 -207
  56. euroeval/utils.py +150 -35
  57. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
  58. euroeval-16.0.1.dist-info/RECORD +69 -0
  59. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
  60. euroeval/human_evaluation.py +0 -738
  61. euroeval/metrics.py +0 -470
  62. euroeval-15.16.0.dist-info/RECORD +0 -63
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
  64. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
euroeval/prompt_templates/summarization.py CHANGED
@@ -1,10 +1,15 @@
  """Templates for the Summarization task."""
 
+ import typing as t
+
  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, PT, SV
+ from ..languages import DA, DE, EN, ES, ET, FI, FR, IS, IT, LV, NB, NL, NN, NO, PT, SV
+
+ if t.TYPE_CHECKING:
+ from ..data_models import Language
 
  # TODO: Missing Faroese
- SUMM_TEMPLATES = {
+ SUMM_TEMPLATES: dict["Language", PromptConfig] = {
  DA: PromptConfig(
  default_prompt_prefix="Følgende er dokumenter med tilhørende resuméer.",
  default_prompt_template="Dokument: {text}\nResumé: {target_text}",
@@ -32,8 +37,14 @@ SUMM_TEMPLATES = {
  default_prompt_prefix="A continuación se presentan documentos con resúmenes "
  "adjuntos.",
  default_prompt_template="Documento: {text}\nResumen: {target_text}",
- default_instruction_prompt="Documento: {text}\n\nEscriba un resumen del
- "documento anterior.",
+ default_instruction_prompt="Documento: {text}\n\n",
+ default_prompt_label_mapping=dict(),
+ ),
+ ET: PromptConfig(
+ default_prompt_prefix="Allpool on dokumendid koos kokkuvõtetega.",
+ default_prompt_template="Dokument: {text}\nKokkuvõte: {target_text}",
+ default_instruction_prompt="Dokument: {text}\n\nKoosta ülaltoodud dokumendi "
+ "kokkuvõte.",
  default_prompt_label_mapping=dict(),
  ),
  PT: PromptConfig(
@@ -58,6 +69,15 @@ SUMM_TEMPLATES = {
  "document ci-dessus.",
  default_prompt_label_mapping=dict(),
  ),
+ LV: PromptConfig(
+ default_prompt_prefix="Tālāk ir dokumenti ar pievienotām kopsavilkumiem.",
+ default_prompt_template="Dokuments: {text}\nKopsavilkums: {target_text}",
+ default_instruction_prompt=(
+ "Dokuments: {text}\n\n"
+ "Uzrakstiet kopsavilkumu par iepriekš minēto dokumentu."
+ ),
+ default_prompt_label_mapping=dict(),
+ ),
  IS: PromptConfig(
  default_prompt_prefix="Eftirfarandi eru skjöl með meðfylgjandi samantektum.",
  default_prompt_template="Skjal: {text}\nSamantekt: {target_text}",
euroeval/scores.py CHANGED
@@ -52,7 +52,12 @@ def log_scores(
  test_se, test_se_str = metric.postprocessing_fn(test_se)
  total_dict[f"test_{metric.name}"] = test_score
  total_dict[f"test_{metric.name}_se"] = test_se
- logger.info(f"{metric.pretty_name}: {test_score_str} ± {test_se_str}")
+ log_str = (
+ f"{metric.pretty_name}: {test_score_str} ± {test_se_str}"
+ if not np.isnan(test_se)
+ else f"{metric.pretty_name}: {test_score_str}"
+ )
+ logger.info(log_str)
 
  return dict(raw=scores, total=total_dict)
 
@@ -84,7 +89,7 @@ def aggregate_scores(
 
  if len(test_scores) > 1:
  sample_std = np.std(test_scores, ddof=1)
- test_se = sample_std / np.sqrt(len(test_scores))
+ test_se = (sample_std / np.sqrt(len(test_scores))).item()
  else:
  test_se = np.nan
 
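
Note on the aggregate_scores change: the standard-error formula itself is unchanged (sample standard deviation over the iteration scores divided by the square root of the number of iterations), but the added .item() call turns the NumPy scalar into a plain Python float, presumably so the stored score serialises cleanly. A small sketch with made-up scores:

import numpy as np

test_scores = [0.71, 0.74, 0.69, 0.73]  # hypothetical per-iteration scores

sample_std = np.std(test_scores, ddof=1)                    # sample standard deviation
test_se = (sample_std / np.sqrt(len(test_scores))).item()   # standard error, as a plain float

print(type(test_se).__name__, round(test_se, 4))            # float 0.0111
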
euroeval/speed_benchmark.py CHANGED
@@ -59,7 +59,7 @@ def benchmark_speed_single_iteration(
  Returns:
  A dictionary containing the scores for the current iteration.
  """
- gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)
+ gpt2_tokeniser = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)
 
  base_doc = "Document which contains roughly 10 tokens. "
  multiplier = 10 * (1 + itr_idx)
@@ -74,11 +74,11 @@ def benchmark_speed_single_iteration(
  model.generate(inputs=dict(text=[doc]))
 
  def encoder_predict(doc: str) -> None:
- tokenizer = model.get_tokenizer()
+ tokeniser = model.get_tokeniser()
  pytorch_model = model.get_pytorch_module()
  inputs = {
  key: tensor.to(pytorch_model.device)
- for key, tensor in tokenizer(
+ for key, tensor in tokeniser(
  text=[doc], truncation=True, return_tensors="pt"
  ).items()
  }
@@ -102,21 +102,21 @@ def benchmark_speed_single_iteration(
  speed_scores = pyinfer.InferenceReport(
  model=predict, inputs=doc, n_seconds=3
  ).run(print_report=False)
- num_gpt2_tokens = len(gpt2_tokenizer([doc], truncation=True)["input_ids"][0])
+ num_gpt2_tokens = len(gpt2_tokeniser([doc], truncation=True)["input_ids"][0])
  gpt2_tokens_per_second = speed_scores["Infer(p/sec)"] * num_gpt2_tokens
 
  speed_scores_short = pyinfer.InferenceReport(
  model=predict, inputs=short_doc, n_seconds=3
  ).run(print_report=False)
  num_gpt2_tokens_short = len(
- gpt2_tokenizer([short_doc], truncation=True)["input_ids"][0]
+ gpt2_tokeniser([short_doc], truncation=True)["input_ids"][0]
  )
  gpt2_tokens_per_second_short = (
  speed_scores_short["Infer(p/sec)"] * num_gpt2_tokens_short
  )
 
  except (RuntimeError, ValueError, IndexError) as e:
- raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}")
+ raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}") from e
 
  return dict(
  test_speed=gpt2_tokens_per_second, test_speed_short=gpt2_tokens_per_second_short
euroeval/task_group_utils/multiple_choice_classification.py CHANGED
@@ -94,15 +94,15 @@ class MultipleChoiceClassificationTrainer(Trainer):
 
 
  def prepare_examples(
- examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
+ examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
  ) -> "BatchEncoding":
  """Prepare the features.
 
  Args:
  examples:
  The examples to prepare.
- tokenizer:
- The tokenizer to use to prepare the examples.
+ tokeniser:
+ The tokeniser to use to prepare the examples.
 
  Returns:
  The prepared examples.
@@ -110,12 +110,23 @@ def prepare_examples(
  doc: str = examples["text"][0]
  sections = doc.split("\n")
 
- choice_idxs = [
+ candidate_choice_idxs = [
  idx
  for idx, section in enumerate(sections)
- if re.match(pattern=r"^[a-e]\. ", string=section) is not None
+ if re.match(pattern=r"^[a-z0-9]+\. ", string=section) is not None
  ]
- choices = [sections[idx] for idx in choice_idxs]
+
+ # Sometimes the question itself starts with a letter or number followed by a dot, We
+ # want to ignore these cases, and focus on the final contingent block of at least
+ # two choices.
+ choice_idxs: list[int] = list()
+ for idx in reversed(candidate_choice_idxs):
+ if len(choice_idxs) < 2 or (
+ len(choice_idxs) >= 2 and idx == choice_idxs[-1] - 1
+ ):
+ choice_idxs.append(idx)
+
+ choices = [sections[idx] for idx in reversed(choice_idxs)]
 
  # Check that the choices are present, and that all of them are at the end
  assert len(choices) > 0, "No choices found in the document."
@@ -127,7 +138,7 @@ def prepare_examples(
  question_idx = min(choice_idxs) - 2 # -2 to remove the 'Choices:' line
  context_and_question = "\n".join(sections[: question_idx + 1]).strip()
 
- new_examples = tokenizer(
+ new_examples = tokeniser(
  text=[context_and_question] * len(choices),
  text_pair=[choice[3:] for choice in choices],
  padding=True,
@@ -135,7 +146,7 @@ def prepare_examples(
  )
  new_examples["label"] = [
  int(choice.startswith(f"{letter}. ") and letter == examples["label"][0])
- for letter, choice in zip("abcde", choices)
+ for letter, choice in zip("abcdefghijklmnopqrstuvwxyz", choices)
  ]
  new_examples["id"] = [hashlib.md5(string=doc.encode()).hexdigest()] * len(choices)
  return new_examples
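
Note on the rewritten choice detection: it no longer assumes the letters a–e only. It collects every line matching ^[a-z0-9]+\. and then, scanning backwards, keeps only the final contiguous block of such lines, so a question that itself starts with "1. " is not mistaken for a choice. A condensed, self-contained sketch of that logic (the example document is made up, and the condition is a logically equivalent simplification of the one in the diff):

import re

def trailing_choice_indices(sections: list[str]) -> list[int]:
    # Indices of all lines that look like a choice, e.g. "a. ..." or "2. ..."
    candidate_idxs = [
        idx
        for idx, section in enumerate(sections)
        if re.match(r"^[a-z0-9]+\. ", section) is not None
    ]
    # Walk backwards and keep only the final contiguous block of choice lines
    choice_idxs: list[int] = []
    for idx in reversed(candidate_idxs):
        if len(choice_idxs) < 2 or idx == choice_idxs[-1] - 1:
            choice_idxs.append(idx)
    return list(reversed(choice_idxs))

doc = "1. What is the capital of France?\nChoices:\na. Paris\nb. Berlin\nc. Rome"
print(trailing_choice_indices(doc.split("\n")))  # [2, 3, 4]; the question on line 0 is ignored
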
euroeval/task_group_utils/question_answering.py CHANGED
@@ -23,7 +23,7 @@ if t.TYPE_CHECKING:
  from transformers.trainer_utils import EvalPrediction
  from transformers.training_args import TrainingArguments
 
- from ..data_models import DatasetConfig, GenerativeModelOutput
+ from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
  from ..types import Labels, Predictions
 
  logger = logging.getLogger("euroeval")
@@ -57,7 +57,7 @@ class QuestionAnsweringTrainer(Trainer):
  **kwargs,
  )
 
- # Get the CLS token id for the tokenizer
+ # Get the CLS token id for the tokeniser
  if self.tokenizer is not None:
  assert isinstance(self.tokenizer, PreTrainedTokenizerBase)
  special_token_metadata = get_special_token_metadata(self.tokenizer)
@@ -149,6 +149,7 @@ class QuestionAnsweringTrainer(Trainer):
  def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
+ benchmark_config: "BenchmarkConfig",
  dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics needed for evaluation.
@@ -159,6 +160,8 @@ def compute_metrics(
  contains the true labels.
  dataset_config:
  The configuration of the dataset.
+ benchmark_config:
+ The configuration of the benchmark.
  dataset:
  The dataset used for evaluation. This is only used in case any additional
  metadata is used to compute the metrics.
@@ -186,7 +189,11 @@ def compute_metrics(
  results: dict[str, float] = dict()
  for metric in dataset_config.task.metrics:
  score: float | None = metric(
- predictions=predictions, references=labels, dataset=dataset
+ predictions=predictions,
+ references=labels,
+ dataset=dataset,
+ dataset_config=dataset_config,
+ benchmark_config=benchmark_config,
  )
 
  # The metric returns None if we are running on multi-GPU and the current
@@ -221,15 +228,15 @@ def extract_labels_from_generation(
 
 
  def prepare_train_examples(
- examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
+ examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
  ) -> "BatchEncoding":
  """Prepare the features for training.
 
  Args:
  examples:
  The examples to prepare.
- tokenizer:
- The tokenizer to use to prepare the examples.
+ tokeniser:
+ The tokeniser to use to prepare the examples.
 
  Returns:
  The prepared examples.
@@ -239,15 +246,15 @@ def prepare_train_examples(
  # take a lots of space). So we remove that left whitespace
  examples["question"] = [q.lstrip() for q in examples["question"]]
 
- # Extract special token metadata from the tokenizer
- special_token_metadata = get_special_token_metadata(tokenizer=tokenizer)
+ # Extract special token metadata from the tokeniser
+ special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
  has_cls_token = special_token_metadata["has_cls_token"]
  has_sep_token = special_token_metadata["has_sep_token"]
  cls_token_id = special_token_metadata["cls_token_id"]
  cls_token = special_token_metadata["cls_token"]
  sep_token = special_token_metadata["sep_token"]
 
- # If the tokenizer is not adding special tokens, then we add them manually
+ # If the tokeniser is not adding special tokens, then we add them manually
  if not has_cls_token and not has_sep_token:
  examples["question"] = [
  f"{cls_token}{q}{sep_token}" for q in examples["question"]
@@ -258,18 +265,18 @@ def prepare_train_examples(
  # split into several features. Since we are always keeping the question tokens, we
  # need to make sure that the stride does not exceed the resulting maximum context
  # length.
- max_question_tokens = max(len(tokenizer(q).input_ids) for q in examples["question"])
+ max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
  num_special_tokens = int(has_cls_token) + int(has_sep_token)
- stride = tokenizer.model_max_length // 4
- max_length = tokenizer.model_max_length - stride
+ stride = tokeniser.model_max_length // 4
+ max_length = tokeniser.model_max_length - stride
  stride = min(stride, max_length - max_question_tokens - num_special_tokens)
- max_length = tokenizer.model_max_length - stride
+ max_length = tokeniser.model_max_length - stride
 
  # Tokenize our examples with truncation and padding, but keep the overflows using a
  # stride. This results in one example possible giving several features when a
  # context is long, each of those features having a context that overlaps a bit the
  # context of the previous feature.
- tokenized_examples = tokenizer(
+ tokenized_examples = tokeniser(
  text=examples["question"],
  text_pair=examples["context"],
  truncation="only_second",
@@ -306,9 +313,9 @@ def prepare_train_examples(
  sequence_ids = tokenized_examples.sequence_ids(i)
 
  # Manually ensure that the special tokens are set to None in `sequence_ids`
- for special_token in tokenizer.special_tokens_map.keys():
- if hasattr(tokenizer, f"{special_token}_id"):
- special_token_id = getattr(tokenizer, f"{special_token}_id")
+ for special_token in tokeniser.special_tokens_map.keys():
+ if hasattr(tokeniser, f"{special_token}_id"):
+ special_token_id = getattr(tokeniser, f"{special_token}_id")
  if special_token_id is not None:
  sequence_ids = [
  None if token_id == special_token_id else seq_id
@@ -373,15 +380,15 @@ def prepare_train_examples(
 
 
  def prepare_test_examples(
- examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
+ examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
  ) -> "BatchEncoding":
  """Prepare test examples.
 
  Args:
  examples:
  Dictionary of test examples.
- tokenizer:
- The tokenizer used to preprocess the examples.
+ tokeniser:
+ The tokeniser used to preprocess the examples.
 
  Returns:
  The prepared test examples.
@@ -391,14 +398,14 @@ def prepare_test_examples(
  # take a lots of space). So we remove that left whitespace
  examples["question"] = [q.lstrip() for q in examples["question"]]
 
- # Extract special token metadata from the tokenizer
- special_token_metadata = get_special_token_metadata(tokenizer=tokenizer)
+ # Extract special token metadata from the tokeniser
+ special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
  has_cls_token = special_token_metadata["has_cls_token"]
  has_sep_token = special_token_metadata["has_sep_token"]
  cls_token = special_token_metadata["cls_token"]
  sep_token = special_token_metadata["sep_token"]
 
- # If the tokenizer is not adding special tokens, then we add them manually
+ # If the tokeniser is not adding special tokens, then we add them manually
  if not has_cls_token and not has_sep_token:
  examples["question"] = [
  f"{cls_token}{q}{sep_token}" for q in examples["question"]
@@ -409,18 +416,18 @@ def prepare_test_examples(
  # split into several features. Since we are always keeping the question tokens, we
  # need to make sure that the stride does not exceed the resulting maximum context
  # length.
- max_question_tokens = max(len(tokenizer(q).input_ids) for q in examples["question"])
+ max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
  num_special_tokens = int(has_cls_token) + int(has_sep_token)
- stride = tokenizer.model_max_length // 4
- max_length = tokenizer.model_max_length - stride
+ stride = tokeniser.model_max_length // 4
+ max_length = tokeniser.model_max_length - stride
  stride = min(stride, max_length - max_question_tokens - num_special_tokens)
- max_length = tokenizer.model_max_length - stride
+ max_length = tokeniser.model_max_length - stride
 
  # Tokenize our examples with truncation and maybe padding, but keep the overflows
  # using a stride. This results in one example possible giving several features when
  # a context is long, each of those features having a context that overlaps a bit
  # the context of the previous feature.
- tokenized_examples = tokenizer(
+ tokenized_examples = tokeniser(
  text=examples["question"],
  text_pair=examples["context"],
  truncation="only_second",
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -7,14 +7,19 @@ import typing as t
  import Levenshtein
  import numpy as np
 
+ from ..enums import TaskGroup
  from ..exceptions import InvalidBenchmark
- from ..utils import log_once, raise_if_model_output_contains_nan_values
+ from ..utils import (
+ extract_multiple_choice_labels,
+ log_once,
+ raise_if_model_output_contains_nan_values,
+ )
 
  if t.TYPE_CHECKING:
  from datasets.arrow_dataset import Dataset
  from transformers.trainer_utils import EvalPrediction
 
- from ..data_models import DatasetConfig, GenerativeModelOutput
+ from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
  from ..types import Labels, Predictions
 
 
@@ -24,6 +29,7 @@ logger = logging.getLogger("euroeval")
  def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
+ benchmark_config: "BenchmarkConfig",
  dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics needed for evaluation.
@@ -34,6 +40,8 @@ def compute_metrics(
  contains the true labels.
  dataset_config:
  The configuration of the dataset.
+ benchmark_config:
+ The configuration of the benchmark.
  dataset:
  The dataset used for evaluation. This is only used in case any additional
  metadata is used to compute the metrics.
@@ -79,7 +87,11 @@ def compute_metrics(
  results: dict[str, float] = dict()
  for metric in dataset_config.task.metrics:
  score: float | None = metric(
- predictions=predictions, references=label_ids, dataset=dataset
+ predictions=predictions,
+ references=label_ids,
+ dataset=dataset,
+ dataset_config=dataset_config,
+ benchmark_config=benchmark_config,
  )
 
  # The metric returns None if we are running on multi-GPU and the current
@@ -113,7 +125,28 @@ def extract_labels_from_generation(
 
  Returns:
  The predicted labels.
+
+ Raises:
+ InvalidBenchmark:
+ If the task requires log probabilities, but the model did not output them,
+ or if the model outputted log probabilities but the first label token
+ mapping is not provided.
  """
+ # Get the candidate labels, which are the labels that the model can predict
+ default_labels = [
+ dataset_config.prompt_label_mapping[lbl]
+ for lbl in dataset_config.id2label.values()
+ ]
+ if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+ sample_candidate_labels = [
+ extract_multiple_choice_labels(
+ prompt=prompt, candidate_labels=default_labels
+ )
+ for prompt in input_batch["prompt"]
+ ]
+ else:
+ sample_candidate_labels = [default_labels] * len(input_batch["prompt"])
+
  if model_output.scores is not None:
  if first_label_token_mapping is False:
  raise InvalidBenchmark(
@@ -122,38 +155,85 @@ def extract_labels_from_generation(
  )
  labels = get_closest_logprobs_labels(
  generation_logprobs=model_output.scores,
- dataset_config=dataset_config,
  first_label_token_mapping=first_label_token_mapping,
+ candidate_labels=sample_candidate_labels,
  )
  if labels is not None:
  return labels
+ elif dataset_config.task.requires_logprobs:
+ raise InvalidBenchmark(
+ "This task requires the model to output logprobs, and this model "
+ "does not seem to be able to do that. Skipping the evaluation."
+ )
 
- candidate_labels = [
- dataset_config.prompt_label_mapping[lbl]
- for lbl in dataset_config.id2label.values()
- ]
  new_predicted_labels: list[str] = list()
- for predicted_label in model_output.sequences:
+ for idx, predicted_label in enumerate(model_output.sequences):
  # If the prediction includes a boxed answer, use that instead of the full
  # generation
  if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
  predicted_label = m.group(1)
 
- # Pick the label with the smallest word edit distance to the predicted label
+ # We set the word edit distance weights such that we heavily penalise insertions
+ # and substitutions, so that we don't just insert the correct label, but that we
+ # want the model to have included the correct label in its output.
+ insertion_weight = 1000
+ deletion_weight = 1
+ substitution_weight = 1000
+
+ # Compute the word edit distances between the predicted label and all candidate
+ # labels
  edit_distances = [
- Levenshtein.distance(s1=predicted_label.lower(), s2=candidate_label.lower())
- for candidate_label in candidate_labels
+ Levenshtein.distance(
+ s1=predicted_label.lower(),
+ s2=candidate_label.lower(),
+ weights=(insertion_weight, deletion_weight, substitution_weight),
+ )
+ for candidate_label in sample_candidate_labels[idx]
  ]
- predicted_label = candidate_labels[np.argmin(edit_distances).item()]
- new_predicted_labels.append(predicted_label)
+
+ best_candidate_label = sample_candidate_labels[idx][
+ np.argmin(edit_distances).item()
+ ]
+
+ # If no candidate labels were found, we either pick the label with the smallest
+ # word edit distance to the predicted label (if invalid model outputs are
+ # allowed), or we raise an error
+ if min(edit_distances) > 100:
+ if dataset_config.task.allow_invalid_model_outputs:
+ logger.warning(
+ "No candidate labels found for the predicted label "
+ f"{predicted_label!r}, out of the candidate labels "
+ f"{sample_candidate_labels[idx]}. This likely means that the model "
+ "output is completely off, but since invalid model outputs are "
+ "allowed for this task, we will use the closest candidate label "
+ f"({best_candidate_label})) as the output label. If you see this "
+ "warning very often, please report this issue to the EuroEval "
+ "team at github.com/EuroEval/EuroEval/issues."
+ )
+ logger.debug(
+ "The candidate labels were extracted from the prompt: "
+ f"{input_batch['text'][idx]!r}."
+ )
+ else:
+ raise InvalidBenchmark(
+ "No candidate labels found for the predicted label "
+ f"{predicted_label!r}, out of the candidate labels "
+ f"{sample_candidate_labels[idx]}. This likely means that the model "
+ "output is completely off, and we cannot extract any labels from "
+ "it. Please check the model output and the candidate labels. The "
+ "candidate labels were extracted from the prompt: "
+ f"{input_batch['text'][idx]!r}."
+ )
+
+ new_predicted_labels.append(best_candidate_label)
 
  return new_predicted_labels
 
 
  def get_closest_logprobs_labels(
  generation_logprobs: list[list[list[tuple[str, float]]]],
- dataset_config: "DatasetConfig",
  first_label_token_mapping: dict[str, str] | t.Literal[True],
+ candidate_labels: list[list[str]],
  ) -> list[str] | None:
  """Get the labels with the highest predicted logprob value.
 
@@ -166,11 +246,11 @@ def get_closest_logprobs_labels(
  generation_logprobs:
  The logprobs of the generated tokens, for all samples in the batch. Of shape
  (batch_size, num_tokens, num_logprobs).
- dataset_config:
- The configuration of the dataset.
  first_label_token_mapping:
  A mapping from labels to the first token in each label, or alternatively a
  `True` value indicating that the model should output logprobs.
+ candidate_labels:
+ The candidate labels for each sample in the batch.
 
  Returns:
  The predicted labels, or None if labels could not be extracted.
@@ -179,19 +259,11 @@ def get_closest_logprobs_labels(
  InvalidBenchmark:
  If no candidate label can be found for any of the generated labels.
  """
- english_labels = list(dataset_config.id2label.values())
- english2local = dataset_config.prompt_label_mapping
- candidate_labels = [english2local[lbl].lower() for lbl in english_labels]
-
  output_labels: list[str] = list()
- for sample in generation_logprobs:
+ for idx, sample in enumerate(generation_logprobs):
  for logprob_list in sample:
  generated_labels = [
- re.sub(
- pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
- repl="",
- string=label.lower(),
- )
+ re.sub(pattern=r"^[^a-zæøåüöä0-9]+$", repl="", string=label.lower())
  for label, _ in logprob_list
  ]
  generated_labels = [label for label in generated_labels if label != ""]
@@ -206,7 +278,7 @@ def get_closest_logprobs_labels(
  if isinstance(first_label_token_mapping, dict):
  if any(
  candidate_label not in first_label_token_mapping
- for candidate_label in candidate_labels
+ for candidate_label in candidate_labels[idx]
  ):
  raise InvalidBenchmark(
  "There is a label not present in the first label token "
@@ -217,16 +289,28 @@ def get_closest_logprobs_labels(
 
  candidate_output_labels = {
  candidate_label
- for candidate_label in candidate_labels
+ for candidate_label in candidate_labels[idx]
  if generated_label == first_label_token_mapping[candidate_label]
  }
  else:
  candidate_output_labels = {
  candidate_label
- for candidate_label in candidate_labels
+ for candidate_label in candidate_labels[idx]
  if candidate_label.startswith(generated_label)
  }
 
+ # If the generated label is a numeral (e.g., "1", "2", "3") and there is
+ # a matching candidate label, we only keep the full match
+ if re.match(r"^\d+$", generated_label) and any(
+ candidate_label == generated_label
+ for candidate_label in candidate_output_labels
+ ):
+ candidate_output_labels = {
+ candidate_label
+ for candidate_label in candidate_output_labels
+ if candidate_label == generated_label
+ }
+
  # If we can uniquely determine the output label, we break the loop.
  if len(candidate_output_labels) == 1:
  output_label = candidate_output_labels.pop()
@@ -257,16 +341,18 @@ def get_closest_logprobs_labels(
  elif len(candidate_output_labels) == 0:
  candidate_output_labels_starting_with_generated_label = [
  candidate_label
- for candidate_label in candidate_labels
+ for candidate_label in candidate_labels[idx]
  if candidate_label.startswith(generated_label)
  ]
  if candidate_output_labels_starting_with_generated_label:
  log_once(
  f"No candidate label found for the generated label "
- f"{generated_label!r}. This means that using logprobs to "
- "extract the labels is not reliable, and we will instead "
- "fall back to extracting the labels using word edit "
- "distance.",
+ f"{generated_label!r}, but there are candidate labels "
+ f"starting with it: "
+ f"{candidate_output_labels_starting_with_generated_label}. "
+ "This means that the first label token mapping is not "
+ "reliable, and we will instead fall back to extracting "
+ "the labels using word edit distance.",
  level=logging.DEBUG,
  )
  return None
@@ -291,18 +377,18 @@ def get_closest_logprobs_labels(
  if len(sample) == 0:
  log_once(
  "The model outputted an empty string, so no candidate labels could "
- f"be determined. Using {candidate_labels[0]!r} as the output "
- "label.",
- level=logging.DEBUG,
+ "be determined. Using the first label, "
+ f"{candidate_labels[idx][0]!r}, as the output label.",
+ level=logging.INFO,
  )
  else:
  log_once(
  "Could not find a candidate label for any of the generated "
- f"labels in the sample {sample}. Using {candidate_labels[0]!r} "
- "as the output label.",
- level=logging.DEBUG,
+ f"labels in the sample {sample}. Using the first label, "
+ f"{candidate_labels[idx][0]!r}, as the output label.",
+ level=logging.INFO,
  )
- output_labels.append(candidate_labels[0])
+ output_labels.append(candidate_labels[idx][0])
 
  assert len(output_labels) == len(generation_logprobs)
  return output_labels
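
Note on the weighted edit distance introduced above: deletions are cheap while insertions and substitutions cost 1000, so a candidate label only gets a low distance when it already appears essentially verbatim inside the model output; the min(edit_distances) > 100 check then flags outputs that contain no candidate at all. A small sketch using the Levenshtein package, with a made-up model output and made-up labels:

import Levenshtein

predicted = "The sentiment of the review is clearly positive."  # hypothetical model output
candidates = ["positive", "negative", "neutral"]                # hypothetical candidate labels

# (insertion, deletion, substitution) weights: only deletions are cheap
weights = (1000, 1, 1000)

edit_distances = [
    Levenshtein.distance(predicted.lower(), candidate.lower(), weights=weights)
    for candidate in candidates
]
print(edit_distances)
# "positive" occurs verbatim in the output, so its distance is just the number of
# deleted surrounding characters (about 40); the other labels need at least one
# insertion or substitution and land well above the 100 threshold.
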