EuroEval 15.12.0 (py3-none-any.whl) → 16.7.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. euroeval/__init__.py +32 -14
  2. euroeval/benchmark_config_factory.py +92 -180
  3. euroeval/benchmark_modules/base.py +49 -39
  4. euroeval/benchmark_modules/fresh.py +35 -21
  5. euroeval/benchmark_modules/hf.py +280 -244
  6. euroeval/benchmark_modules/litellm.py +752 -312
  7. euroeval/benchmark_modules/vllm.py +570 -268
  8. euroeval/benchmarker.py +651 -528
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +49 -38
  12. euroeval/constants.py +44 -25
  13. euroeval/data_loading.py +111 -55
  14. euroeval/data_models.py +490 -323
  15. euroeval/dataset_configs/__init__.py +26 -4
  16. euroeval/dataset_configs/bosnian.py +39 -0
  17. euroeval/dataset_configs/bulgarian.py +56 -0
  18. euroeval/dataset_configs/croatian.py +56 -0
  19. euroeval/dataset_configs/czech.py +75 -0
  20. euroeval/dataset_configs/danish.py +78 -50
  21. euroeval/dataset_configs/dutch.py +74 -44
  22. euroeval/dataset_configs/english.py +71 -36
  23. euroeval/dataset_configs/estonian.py +111 -0
  24. euroeval/dataset_configs/faroese.py +25 -18
  25. euroeval/dataset_configs/finnish.py +63 -26
  26. euroeval/dataset_configs/french.py +65 -32
  27. euroeval/dataset_configs/german.py +77 -36
  28. euroeval/dataset_configs/greek.py +64 -0
  29. euroeval/dataset_configs/icelandic.py +68 -57
  30. euroeval/dataset_configs/italian.py +68 -36
  31. euroeval/dataset_configs/latvian.py +87 -0
  32. euroeval/dataset_configs/lithuanian.py +64 -0
  33. euroeval/dataset_configs/norwegian.py +98 -72
  34. euroeval/dataset_configs/polish.py +96 -0
  35. euroeval/dataset_configs/portuguese.py +63 -40
  36. euroeval/dataset_configs/serbian.py +64 -0
  37. euroeval/dataset_configs/slovak.py +55 -0
  38. euroeval/dataset_configs/slovene.py +56 -0
  39. euroeval/dataset_configs/spanish.py +68 -34
  40. euroeval/dataset_configs/swedish.py +82 -41
  41. euroeval/dataset_configs/ukrainian.py +64 -0
  42. euroeval/enums.py +12 -6
  43. euroeval/exceptions.py +21 -1
  44. euroeval/finetuning.py +34 -26
  45. euroeval/generation.py +76 -41
  46. euroeval/generation_utils.py +169 -34
  47. euroeval/languages.py +1020 -188
  48. euroeval/logging_utils.py +268 -0
  49. euroeval/metrics/__init__.py +6 -0
  50. euroeval/metrics/base.py +85 -0
  51. euroeval/metrics/huggingface.py +216 -0
  52. euroeval/metrics/llm_as_a_judge.py +260 -0
  53. euroeval/metrics/pipeline.py +289 -0
  54. euroeval/metrics/speed.py +48 -0
  55. euroeval/model_cache.py +40 -21
  56. euroeval/model_config.py +4 -5
  57. euroeval/model_loading.py +3 -0
  58. euroeval/prompt_templates/__init__.py +2 -0
  59. euroeval/prompt_templates/classification.py +206 -0
  60. euroeval/prompt_templates/linguistic_acceptability.py +157 -22
  61. euroeval/prompt_templates/multiple_choice.py +159 -17
  62. euroeval/prompt_templates/named_entity_recognition.py +318 -21
  63. euroeval/prompt_templates/reading_comprehension.py +207 -16
  64. euroeval/prompt_templates/sentiment_classification.py +205 -22
  65. euroeval/prompt_templates/summarization.py +122 -22
  66. euroeval/prompt_templates/token_classification.py +279 -0
  67. euroeval/scores.py +20 -9
  68. euroeval/speed_benchmark.py +11 -12
  69. euroeval/task_group_utils/multiple_choice_classification.py +21 -12
  70. euroeval/task_group_utils/question_answering.py +101 -73
  71. euroeval/task_group_utils/sequence_classification.py +144 -61
  72. euroeval/task_group_utils/text_to_text.py +33 -12
  73. euroeval/task_group_utils/token_classification.py +86 -89
  74. euroeval/tasks.py +75 -16
  75. euroeval/tokenisation_utils.py +603 -0
  76. euroeval/types.py +17 -11
  77. euroeval/utils.py +332 -137
  78. euroeval-16.7.1.dist-info/METADATA +623 -0
  79. euroeval-16.7.1.dist-info/RECORD +84 -0
  80. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
  81. euroeval/human_evaluation.py +0 -737
  82. euroeval/metrics.py +0 -452
  83. euroeval/tokenization_utils.py +0 -498
  84. euroeval-15.12.0.dist-info/METADATA +0 -285
  85. euroeval-15.12.0.dist-info/RECORD +0 -63
  86. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
  87. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/task_group_utils/sequence_classification.py
@@ -1,5 +1,6 @@
 """Utility functions related to the sequence-classification task group."""
 
+import collections.abc as c
 import logging
 import re
 import typing as t
@@ -7,22 +8,32 @@
 import Levenshtein
 import numpy as np
 
+from ..enums import TaskGroup
 from ..exceptions import InvalidBenchmark
-from ..utils import log_once, raise_if_model_output_contains_nan_values
+from ..utils import (
+    extract_multiple_choice_labels,
+    log_once,
+    raise_if_model_output_contains_nan_values,
+)
 
 if t.TYPE_CHECKING:
+    from datasets.arrow_dataset import Dataset
     from transformers.trainer_utils import EvalPrediction
 
-    from ..data_models import DatasetConfig, GenerativeModelOutput
+    from ..data_models import (
+        BenchmarkConfig,
+        DatasetConfig,
+        GenerativeModelOutput,
+        ModelConfig,
+    )
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+    dataset: "Dataset",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
 
@@ -32,6 +43,11 @@
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
+        benchmark_config:
+            The configuration of the benchmark.
+        dataset:
+            The dataset used for evaluation. This is only used in case any additional
+            metadata is used to compute the metrics.
 
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
@@ -73,7 +89,13 @@
 
     results: dict[str, float] = dict()
     for metric in dataset_config.task.metrics:
-        score: float | None = metric(predictions=predictions, references=label_ids)
+        score: float | None = metric(
+            predictions=predictions,
+            references=label_ids,
+            dataset=dataset,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
+        )
 
         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
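Note on the new call signature above: metrics are now invoked with the evaluation dataset and both configs in addition to predictions and references. Below is a minimal sketch of a metric callable compatible with that call site; only the keyword arguments mirror the diff, and the metric itself and every name in it are hypothetical rather than part of the EuroEval API.

```python
# Minimal sketch of a metric callable matching the call site above. The extra
# keyword arguments (dataset, dataset_config, benchmark_config) are accepted
# and may simply be ignored by metrics that do not need them.
import typing as t


def exact_match_metric(
    predictions: t.Sequence[str],
    references: t.Sequence[str],
    dataset: t.Any = None,  # extra metadata, unused by this toy metric
    dataset_config: t.Any = None,
    benchmark_config: t.Any = None,
) -> float | None:
    """Toy metric: the share of predictions equal to their reference."""
    if not references:
        return None
    correct = sum(pred == ref for pred, ref in zip(predictions, references))
    return correct / len(references)


print(exact_match_metric(predictions=["a", "b"], references=["a", "c"]))  # 0.5
```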
@@ -87,8 +109,9 @@
     input_batch: dict[str, list],
     model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
+    model_config: "ModelConfig",
     first_label_token_mapping: dict[str, str] | bool,
-) -> list[str]:
+) -> c.Sequence[str]:
     """Extract the predicted labels from the generated output.
 
     Args:
@@ -99,6 +122,8 @@
             The raw generated output of the model.
         dataset_config:
             The configuration of the dataset.
+        model_config:
+            The configuration of the model.
         first_label_token_mapping:
             A mapping from labels to the first token in each label, or alternatively a
             Boolean value indicating whether the model should output scores (if the
@@ -106,7 +131,28 @@
 
     Returns:
         The predicted labels.
+
+    Raises:
+        InvalidBenchmark:
+            If the task requires log probabilities, but the model did not output them,
+            or if the model outputted log probabilities but the first label token
+            mapping is not provided.
     """
+    # Get the candidate labels, which are the labels that the model can predict
+    default_labels = [
+        dataset_config.prompt_label_mapping[lbl]
+        for lbl in dataset_config.id2label.values()
+    ]
+    if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+        sample_candidate_labels = [
+            extract_multiple_choice_labels(
+                prompt=prompt, candidate_labels=default_labels
+            )
+            for prompt in input_batch["prompt"]
+        ]
+    else:
+        sample_candidate_labels = [default_labels] * len(input_batch["prompt"])
+
     if model_output.scores is not None:
         if first_label_token_mapping is False:
             raise InvalidBenchmark(
@@ -115,39 +161,93 @@
             )
         labels = get_closest_logprobs_labels(
             generation_logprobs=model_output.scores,
-            dataset_config=dataset_config,
             first_label_token_mapping=first_label_token_mapping,
+            candidate_labels=sample_candidate_labels,
         )
         if labels is not None:
             return labels
+    elif dataset_config.task.requires_logprobs:
+        raise InvalidBenchmark(
+            "This task requires the model to output logprobs, and this model "
+            "does not seem to be able to do that. Skipping the evaluation."
+        )
 
-    candidate_labels = [
-        dataset_config.prompt_label_mapping[lbl]
-        for lbl in dataset_config.id2label.values()
-    ]
     new_predicted_labels: list[str] = list()
-    for predicted_label in model_output.sequences:
+    num_predictions_being_very_off = 0
+    for idx, predicted_label in enumerate(model_output.sequences):
         # If the prediction includes a boxed answer, use that instead of the full
         # generation
         if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
             predicted_label = m.group(1)
 
-        # Pick the label with the smallest word edit distance to the predicted label
+        # We set the word edit distance weights such that we heavily penalise insertions
+        # and substitutions, so that we don't just insert the correct label, but that we
+        # want the model to have included the correct label in its output.
+        insertion_weight = 1000
+        deletion_weight = 1
+        substitution_weight = 1000
+
+        # Compute the word edit distances between the predicted label and all candidate
+        # labels
         edit_distances = [
-            Levenshtein.distance(s1=predicted_label.lower(), s2=candidate_label.lower())
-            for candidate_label in candidate_labels
+            Levenshtein.distance(
+                s1=predicted_label.lower(),
+                s2=candidate_label.lower(),
+                weights=(insertion_weight, deletion_weight, substitution_weight),
+            )
+            for candidate_label in sample_candidate_labels[idx]
         ]
-        predicted_label = candidate_labels[np.argmin(edit_distances).item()]
-        new_predicted_labels.append(predicted_label)
+
+        best_candidate_label = sample_candidate_labels[idx][
+            np.argmin(edit_distances).item()
+        ]
+
+        # If no candidate labels were found, we either pick the label with the smallest
+        # word edit distance to the predicted label (if invalid model outputs are
+        # allowed), or we raise an error
+        if min(edit_distances) >= 1000:
+            num_predictions_being_very_off += 1
+
+        new_predicted_labels.append(best_candidate_label)
+
+    if num_predictions_being_very_off > 0:
+        if dataset_config.allow_invalid_model_outputs:
+            log_msg = (
+                "No candidate labels found for the predicted label in "
+                f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+                f"of the samples with the model {model_config.model_id!r}. This "
+                "likely means that the model were completely off in these cases, "
+                "but since invalid model outputs are allowed for this task, we used "
+                "the closest candidate labels as the output labels."
+            )
+            level = logging.DEBUG
+            if num_predictions_being_very_off / len(model_output.sequences) > 0.5:
+                log_msg += (
+                    " Since this happened for most of the model's predictions, please "
+                    "report this issue to the EuroEval team at "
+                    "github.com/EuroEval/EuroEval/issues."
+                )
+                level = logging.WARNING
+            log_once(log_msg, level=level)
+        else:
+            raise InvalidBenchmark(
+                "No candidate labels found for the predicted label in "
+                f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+                "of the samples. This likely means that the model were completely "
+                "off in these cases. Since this task does not allow invalid model "
+                "outputs, we have to abort the evaluation. Please re-run the "
+                "evaluation with the `--debug` flag (or `debug=True` if you're using "
+                "the `Benchmarker` API) to see the precise model outputs."
+            )
 
     return new_predicted_labels
 
 
 def get_closest_logprobs_labels(
-    generation_logprobs: list[list[list[tuple[str, float]]]],
-    dataset_config: "DatasetConfig",
+    generation_logprobs: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]],
     first_label_token_mapping: dict[str, str] | t.Literal[True],
-) -> list[str] | None:
+    candidate_labels: c.Sequence[c.Sequence[str]],
+) -> c.Sequence[str] | None:
     """Get the labels with the highest predicted logprob value.
 
     In case a candidate label is split into multiple tokens, we only use the first
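The weights tuple in the hunk above makes insertions and substitutions effectively forbidden (cost 1000) while deletions stay cheap (cost 1), so a candidate label only receives a small distance when it can be reached by deleting characters from the model output, i.e. when the output already contains that label; any distance of 1000 or more therefore flags a prediction with no recognisable label. A small standalone illustration follows, assuming the `Levenshtein` package is installed; the labels and prediction are made up.

```python
# Illustration of the weighted edit distance used above. With insertion and
# substitution weighted at 1000 and deletion at 1, a candidate label is only
# "cheap" to reach when the model output already contains it.
import Levenshtein

candidate_labels = ["positive", "negative", "neutral"]
prediction = "the sentiment is clearly negative."

weights = (1000, 1, 1000)  # (insertion, deletion, substitution)
distances = [
    Levenshtein.distance(prediction.lower(), label.lower(), weights=weights)
    for label in candidate_labels
]
print(dict(zip(candidate_labels, distances)))
# "negative" needs only deletions and scores low; "positive" and "neutral"
# require insertions or substitutions and score >= 1000, which is what the
# `min(edit_distances) >= 1000` check above treats as "no label found".
best = min(zip(distances, candidate_labels))[1]
print(best)  # negative
```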
@@ -159,11 +259,11 @@
         generation_logprobs:
             The logprobs of the generated tokens, for all samples in the batch. Of shape
             (batch_size, num_tokens, num_logprobs).
-        dataset_config:
-            The configuration of the dataset.
         first_label_token_mapping:
             A mapping from labels to the first token in each label, or alternatively a
             `True` value indicating that the model should output logprobs.
+        candidate_labels:
+            The candidate labels for each sample in the batch.
 
     Returns:
         The predicted labels, or None if labels could not be extracted.
@@ -172,19 +272,11 @@
         InvalidBenchmark:
             If no candidate label can be found for any of the generated labels.
     """
-    english_labels = list(dataset_config.id2label.values())
-    english2local = dataset_config.prompt_label_mapping
-    candidate_labels = [english2local[lbl].lower() for lbl in english_labels]
-
     output_labels: list[str] = list()
-    for sample in generation_logprobs:
+    for idx, sample in enumerate(generation_logprobs):
         for logprob_list in sample:
             generated_labels = [
-                re.sub(
-                    pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
-                    repl="",
-                    string=label.lower(),
-                )
+                re.sub(pattern=r"^[^a-zæøåüöä0-9]+$", repl="", string=label.lower())
                 for label, _ in logprob_list
             ]
             generated_labels = [label for label in generated_labels if label != ""]
@@ -199,7 +291,7 @@
             if isinstance(first_label_token_mapping, dict):
                 if any(
                     candidate_label not in first_label_token_mapping
-                    for candidate_label in candidate_labels
+                    for candidate_label in candidate_labels[idx]
                 ):
                     raise InvalidBenchmark(
                         "There is a label not present in the first label token "
@@ -210,14 +302,14 @@
 
                 candidate_output_labels = {
                     candidate_label
-                    for candidate_label in candidate_labels
+                    for candidate_label in candidate_labels[idx]
                     if generated_label == first_label_token_mapping[candidate_label]
                 }
             else:
                 candidate_output_labels = {
                     candidate_label
-                    for candidate_label in candidate_labels
-                    if candidate_label.startswith(generated_label)
+                    for candidate_label in candidate_labels[idx]
+                    if candidate_label.startswith(generated_label.strip())
                 }
 
             # If we can uniquely determine the output label, we break the loop.
@@ -250,33 +342,22 @@
             elif len(candidate_output_labels) == 0:
                 candidate_output_labels_starting_with_generated_label = [
                     candidate_label
-                    for candidate_label in candidate_labels
+                    for candidate_label in candidate_labels[idx]
                    if candidate_label.startswith(generated_label)
                 ]
                 if candidate_output_labels_starting_with_generated_label:
                     log_once(
                         f"No candidate label found for the generated label "
-                        f"{generated_label!r}. This means that using logprobs to "
-                        "extract the labels is not reliable, and we will instead "
-                        "fall back to extracting the labels using word edit "
-                        "distance.",
+                        f"{generated_label!r}, but there are candidate labels "
+                        f"starting with it: "
+                        f"{candidate_output_labels_starting_with_generated_label}. "
+                        "This means that the first label token mapping is not "
+                        "reliable, and we will instead fall back to extracting "
+                        "the labels using word edit distance.",
                         level=logging.DEBUG,
                     )
                     return None
 
-                # If we did not find any candidate label for any of the generated labels, we
-                # assume that something is wrong with the model output, and we fall back to
-                # using word edit distance to extract the labels
-                else:
-                    log_once(
-                        f"No candidate label found for any of the generated labels "
-                        f"{generated_labels}. This means that using logprobs to extract "
-                        "the labels is not reliable, and we will instead fall back to "
-                        "extracting the labels using word edit distance.",
-                        level=logging.DEBUG,
-                    )
-                    return None
-
             if output_label is not None:
                 output_labels.append(output_label)
                 break
@@ -284,18 +365,20 @@
             if len(sample) == 0:
                 log_once(
                     "The model outputted an empty string, so no candidate labels could "
-                    f"be determined. Using {candidate_labels[0]!r} as the output "
-                    "label.",
+                    "be determined. This means that using logprobs to extract the "
+                    "labels is not reliable, and we will instead fall back to "
+                    "extracting the labels using word edit distance.",
                     level=logging.DEBUG,
                 )
             else:
                 log_once(
-                    "Could not find a candidate label for any of the generated "
-                    f"labels in the sample {sample}. Using {candidate_labels[0]!r} "
-                    "as the output label.",
+                    "No candidate label found for any of the generated labels, which "
+                    "means that using logprobs to extract the labels is not reliable, "
+                    "and we will instead fall back to extracting the labels using "
+                    "word edit distance.",
                     level=logging.DEBUG,
                 )
-            output_labels.append(candidate_labels[0])
+            return None
 
     assert len(output_labels) == len(generation_logprobs)
     return output_labels
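For context, the logprob path above matches the model's top tokens against a first-token mapping per candidate label and now returns `None` whenever the match is missing or ambiguous, so the caller falls back to the edit-distance path instead of silently picking the first label. The standalone sketch below illustrates that matching idea only; it is not the EuroEval implementation, and all names and data in it are made up.

```python
# Standalone sketch of the first-token matching idea: compare the top-logprob
# tokens against a mapping from each candidate label to its first token, and
# return None when the match is missing or ambiguous so the caller can fall
# back to word-edit-distance matching.
def pick_label_from_logprobs(
    logprob_list: list[tuple[str, float]],
    first_label_token_mapping: dict[str, str],
    candidate_labels: list[str],
) -> str | None:
    for token, _ in sorted(logprob_list, key=lambda pair: pair[1], reverse=True):
        token = token.strip().lower()
        if not token:
            continue
        matches = {
            label
            for label in candidate_labels
            if token == first_label_token_mapping[label]
        }
        if len(matches) == 1:
            return matches.pop()
    return None  # ambiguous or missing: fall back to word edit distance


mapping = {"positiv": "posit", "negativ": "negativ"}
logprobs = [("posit", -0.1), ("negativ", -2.3)]
print(pick_label_from_logprobs(logprobs, mapping, ["positiv", "negativ"]))  # positiv
```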
euroeval/task_group_utils/text_to_text.py
@@ -1,5 +1,6 @@
 """Utility functions related to the text-to-text task group."""
 
+import collections.abc as c
 import logging
 import typing as t
 
@@ -7,23 +8,23 @@
 from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
 from ..metrics import HuggingFaceMetric
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
+    from datasets.arrow_dataset import Dataset
     from transformers.trainer_utils import EvalPrediction
 
     from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
+    dataset: "Dataset",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
 
@@ -35,10 +36,17 @@
             The configuration of the dataset.
         benchmark_config:
             The configuration of the benchmark.
+        dataset:
+            The dataset used for evaluation. This is only used in case any additional
+            metadata is used to compute the metrics.
 
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
         values.
+
+    Raises:
+        InvalidBenchmark:
+            If the metric computation fails.
     """
     model_outputs, labels = model_outputs_and_labels
 
@@ -67,9 +75,15 @@
     ):
         metric.compute_kwargs["device"] = benchmark_config.device.type
 
-        while True:
+        for _ in range(num_attempts := 5):
             try:
-                score: float | None = metric(predictions=predictions, references=labels)
+                score: float | None = metric(
+                    predictions=predictions,
+                    references=labels,
+                    dataset=dataset,
+                    dataset_config=dataset_config,
+                    benchmark_config=benchmark_config,
+                )
                 break
             except Exception as e:
                 oom_error = [
@@ -78,28 +92,35 @@
                     "MPS backend out of memory",
                 ]
                 if not any(error in str(e) for error in oom_error):
-                    raise InvalidBenchmark(str(e))
+                    raise InvalidBenchmark(str(e)) from e
 
                 if (
                     isinstance(metric, HuggingFaceMetric)
                     and metric.compute_kwargs.get("device", "cpu") != "cpu"
                 ):
                     metric.compute_kwargs["device"] = "cpu"
-                    logger.debug(
+                    log(
                         "Out of memory error occurred during the computation of "
                         f"the metric {metric.pretty_name}. Moving the computation to "
-                        "the CPU."
+                        "the CPU.",
+                        level=logging.DEBUG,
                     )
                 else:
-                    raise InvalidBenchmark(str(e))
+                    raise InvalidBenchmark(str(e)) from e
             finally:
                 for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
                     if hasattr(metric, attribute):
-                        logger.debug(
+                        log(
                             f"Deleting the {attribute!r} attribute of the metric "
-                            f"{metric.pretty_name} to free up memory."
+                            f"{metric.pretty_name} to free up memory.",
+                            level=logging.DEBUG,
                         )
                         delattr(metric, attribute)
+        else:
+            raise InvalidBenchmark(
+                f"Could not compute the metric {metric.pretty_name} after "
+                f"{num_attempts} attempts due to out of memory errors."
+            )
 
         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
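In the hunk above, the unbounded `while True` loop is replaced by a bounded retry: up to five attempts, moving the metric to the CPU after an out-of-memory error, with the `for ... else` branch raising only when every attempt finished without a `break`. Here is a self-contained sketch of that control flow, with a simulated computation standing in for the real metric call; the exception class and helper below are made up for illustration.

```python
# Self-contained sketch of the bounded-retry pattern used above. Python's
# `for ... else` runs the `else` block only when the loop finishes without
# hitting `break`, i.e. when every attempt failed.
class OutOfMemory(Exception):
    pass


outcomes = iter([True, True, False])  # fail twice, then succeed


def flaky_compute() -> float:
    if next(outcomes):  # simulate an out-of-memory failure
        raise OutOfMemory("CUDA out of memory")
    return 0.42


for attempt in range(num_attempts := 5):
    try:
        score = flaky_compute()
        break
    except OutOfMemory as exc:
        print(f"Attempt {attempt + 1} failed ({exc}); retrying with a smaller footprint.")
else:
    raise RuntimeError(f"Computation failed after {num_attempts} attempts.")

print(score)  # 0.42, obtained on the third attempt
```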
@@ -111,7 +132,7 @@
 
 def extract_labels_from_generation(
     input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-) -> list[t.Any]:
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.
 
     Args: