EuroEval 15.16.0__py3-none-any.whl → 16.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (63)
  1. euroeval/__init__.py +3 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +161 -114
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +13 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +53 -7
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +38 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +46 -14
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +234 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  41. euroeval/prompt_templates/multiple_choice.py +23 -2
  42. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  43. euroeval/prompt_templates/reading_comprehension.py +42 -2
  44. euroeval/prompt_templates/sentiment_classification.py +46 -2
  45. euroeval/prompt_templates/summarization.py +24 -4
  46. euroeval/scores.py +7 -2
  47. euroeval/speed_benchmark.py +6 -6
  48. euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  49. euroeval/task_group_utils/question_answering.py +35 -28
  50. euroeval/task_group_utils/sequence_classification.py +96 -23
  51. euroeval/task_group_utils/text_to_text.py +7 -3
  52. euroeval/task_group_utils/token_classification.py +47 -75
  53. euroeval/tasks.py +31 -6
  54. euroeval/tokenization_utils.py +295 -207
  55. euroeval/utils.py +118 -34
  56. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +11 -14
  57. euroeval-16.0.0.dist-info/RECORD +69 -0
  58. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
  59. euroeval/human_evaluation.py +0 -738
  60. euroeval/metrics.py +0 -470
  61. euroeval-15.16.0.dist-info/RECORD +0 -63
  62. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/scores.py CHANGED
@@ -52,7 +52,12 @@ def log_scores(
         test_se, test_se_str = metric.postprocessing_fn(test_se)
         total_dict[f"test_{metric.name}"] = test_score
         total_dict[f"test_{metric.name}_se"] = test_se
-        logger.info(f"{metric.pretty_name}: {test_score_str} ± {test_se_str}")
+        log_str = (
+            f"{metric.pretty_name}: {test_score_str} ± {test_se_str}"
+            if not np.isnan(test_se)
+            else f"{metric.pretty_name}: {test_score_str}"
+        )
+        logger.info(log_str)

     return dict(raw=scores, total=total_dict)

@@ -84,7 +89,7 @@ def aggregate_scores(

     if len(test_scores) > 1:
         sample_std = np.std(test_scores, ddof=1)
-        test_se = sample_std / np.sqrt(len(test_scores))
+        test_se = (sample_std / np.sqrt(len(test_scores))).item()
     else:
         test_se = np.nan

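Taken together, the two scores.py changes make score aggregation more robust: the standard error becomes a plain Python float via .item(), and the "± se" suffix is only logged when a standard error actually exists. A minimal standalone sketch with hypothetical scores (not EuroEval's API):

    import numpy as np

    test_scores = [0.71, 0.74, 0.69]  # hypothetical per-iteration scores
    if len(test_scores) > 1:
        sample_std = np.std(test_scores, ddof=1)
        # .item() turns the NumPy scalar into a plain float, e.g. for JSON output
        test_se = (sample_std / np.sqrt(len(test_scores))).item()
    else:
        test_se = np.nan  # a single iteration has no standard error

    test_score = float(np.mean(test_scores))
    log_str = (
        f"Accuracy: {test_score:.2%} ± {test_se:.2%}"
        if not np.isnan(test_se)
        else f"Accuracy: {test_score:.2%}"
    )
    print(log_str)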

euroeval/speed_benchmark.py CHANGED
@@ -59,7 +59,7 @@ def benchmark_speed_single_iteration(
     Returns:
         A dictionary containing the scores for the current iteration.
     """
-    gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)
+    gpt2_tokeniser = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)

     base_doc = "Document which contains roughly 10 tokens. "
     multiplier = 10 * (1 + itr_idx)
@@ -74,11 +74,11 @@ def benchmark_speed_single_iteration(
             model.generate(inputs=dict(text=[doc]))

         def encoder_predict(doc: str) -> None:
-            tokenizer = model.get_tokenizer()
+            tokeniser = model.get_tokeniser()
             pytorch_model = model.get_pytorch_module()
             inputs = {
                 key: tensor.to(pytorch_model.device)
-                for key, tensor in tokenizer(
+                for key, tensor in tokeniser(
                     text=[doc], truncation=True, return_tensors="pt"
                 ).items()
             }
@@ -102,21 +102,21 @@ def benchmark_speed_single_iteration(
         speed_scores = pyinfer.InferenceReport(
             model=predict, inputs=doc, n_seconds=3
         ).run(print_report=False)
-        num_gpt2_tokens = len(gpt2_tokenizer([doc], truncation=True)["input_ids"][0])
+        num_gpt2_tokens = len(gpt2_tokeniser([doc], truncation=True)["input_ids"][0])
         gpt2_tokens_per_second = speed_scores["Infer(p/sec)"] * num_gpt2_tokens

         speed_scores_short = pyinfer.InferenceReport(
             model=predict, inputs=short_doc, n_seconds=3
         ).run(print_report=False)
         num_gpt2_tokens_short = len(
-            gpt2_tokenizer([short_doc], truncation=True)["input_ids"][0]
+            gpt2_tokeniser([short_doc], truncation=True)["input_ids"][0]
         )
         gpt2_tokens_per_second_short = (
             speed_scores_short["Infer(p/sec)"] * num_gpt2_tokens_short
         )

     except (RuntimeError, ValueError, IndexError) as e:
-        raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}")
+        raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}") from e

     return dict(
         test_speed=gpt2_tokens_per_second, test_speed_short=gpt2_tokens_per_second_short
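The speed benchmark itself is unchanged apart from the renames and the exception chaining: throughput is measured as inferences per second and then scaled by the GPT-2 token count of the benchmarked document. A rough sketch of that conversion, where the infer_per_sec value is a stand-in for pyinfer's "Infer(p/sec)" figure:

    from transformers import AutoTokenizer

    gpt2_tokeniser = AutoTokenizer.from_pretrained("gpt2")
    doc = "Document which contains roughly 10 tokens. " * 10
    num_gpt2_tokens = len(gpt2_tokeniser([doc], truncation=True)["input_ids"][0])

    infer_per_sec = 4.2  # hypothetical throughput in documents per second
    gpt2_tokens_per_second = infer_per_sec * num_gpt2_tokens
    print(f"{gpt2_tokens_per_second:.0f} GPT-2 tokens per second")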

euroeval/task_group_utils/multiple_choice_classification.py CHANGED
@@ -94,15 +94,15 @@ class MultipleChoiceClassificationTrainer(Trainer):


 def prepare_examples(
-    examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare the features.

     Args:
         examples:
             The examples to prepare.
-        tokenizer:
-            The tokenizer to use to prepare the examples.
+        tokeniser:
+            The tokeniser to use to prepare the examples.

     Returns:
         The prepared examples.
@@ -110,11 +110,22 @@ def prepare_examples(
     doc: str = examples["text"][0]
     sections = doc.split("\n")

-    choice_idxs = [
+    candidate_choice_idxs = [
         idx
         for idx, section in enumerate(sections)
-        if re.match(pattern=r"^[a-e]\. ", string=section) is not None
+        if re.match(pattern=r"^[a-z0-9]+\. ", string=section) is not None
     ]
+
+    # Sometimes the question itself starts with a letter or number followed by a dot, We
+    # want to ignore these cases, and focus on the final contingent block of at least
+    # two choices.
+    choice_idxs: list[int] = list()
+    for idx in reversed(candidate_choice_idxs):
+        if len(choice_idxs) < 2 or (
+            len(choice_idxs) >= 2 and idx == choice_idxs[-1] - 1
+        ):
+            choice_idxs.append(idx)
+
     choices = [sections[idx] for idx in choice_idxs]

     # Check that the choices are present, and that all of them are at the end
@@ -127,7 +138,7 @@ def prepare_examples(
     question_idx = min(choice_idxs) - 2  # -2 to remove the 'Choices:' line
     context_and_question = "\n".join(sections[: question_idx + 1]).strip()

-    new_examples = tokenizer(
+    new_examples = tokeniser(
         text=[context_and_question] * len(choices),
         text_pair=[choice[3:] for choice in choices],
         padding=True,
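The new choice-detection logic matches any letter-or-number prefix followed by a dot and then walks the candidate indices backwards, keeping only the final contiguous block, so a question that itself starts with something like "1. " is no longer mistaken for an answer option. A small self-contained illustration with hypothetical prompt sections:

    import re

    sections = ["1. Which answer is correct?", "Choices:", "a. first", "b. second", "c. third"]

    candidate_choice_idxs = [
        idx
        for idx, section in enumerate(sections)
        if re.match(pattern=r"^[a-z0-9]+\. ", string=section) is not None
    ]  # -> [0, 2, 3, 4]

    # Keep only the final contiguous block of at least two choices
    choice_idxs: list[int] = list()
    for idx in reversed(candidate_choice_idxs):
        if len(choice_idxs) < 2 or (
            len(choice_idxs) >= 2 and idx == choice_idxs[-1] - 1
        ):
            choice_idxs.append(idx)

    print(choice_idxs)  # [4, 3, 2] -- the question line at index 0 is ignored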

euroeval/task_group_utils/question_answering.py CHANGED
@@ -23,7 +23,7 @@ if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
     from transformers.training_args import TrainingArguments

-    from ..data_models import DatasetConfig, GenerativeModelOutput
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions

 logger = logging.getLogger("euroeval")
@@ -57,7 +57,7 @@ class QuestionAnsweringTrainer(Trainer):
             **kwargs,
         )

-        # Get the CLS token id for the tokenizer
+        # Get the CLS token id for the tokeniser
         if self.tokenizer is not None:
             assert isinstance(self.tokenizer, PreTrainedTokenizerBase)
             special_token_metadata = get_special_token_metadata(self.tokenizer)
@@ -149,6 +149,7 @@ class QuestionAnsweringTrainer(Trainer):
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     dataset: "Dataset",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
@@ -159,6 +160,8 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
+        benchmark_config:
+            The configuration of the benchmark.
         dataset:
             The dataset used for evaluation. This is only used in case any additional
             metadata is used to compute the metrics.
@@ -186,7 +189,11 @@ def compute_metrics(
     results: dict[str, float] = dict()
     for metric in dataset_config.task.metrics:
         score: float | None = metric(
-            predictions=predictions, references=labels, dataset=dataset
+            predictions=predictions,
+            references=labels,
+            dataset=dataset,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
         )

         # The metric returns None if we are running on multi-GPU and the current
@@ -221,15 +228,15 @@ def extract_labels_from_generation(


 def prepare_train_examples(
-    examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare the features for training.

     Args:
         examples:
             The examples to prepare.
-        tokenizer:
-            The tokenizer to use to prepare the examples.
+        tokeniser:
+            The tokeniser to use to prepare the examples.

     Returns:
         The prepared examples.
@@ -239,15 +246,15 @@ def prepare_train_examples(
     # take a lots of space). So we remove that left whitespace
     examples["question"] = [q.lstrip() for q in examples["question"]]

-    # Extract special token metadata from the tokenizer
-    special_token_metadata = get_special_token_metadata(tokenizer=tokenizer)
+    # Extract special token metadata from the tokeniser
+    special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
     has_cls_token = special_token_metadata["has_cls_token"]
     has_sep_token = special_token_metadata["has_sep_token"]
     cls_token_id = special_token_metadata["cls_token_id"]
     cls_token = special_token_metadata["cls_token"]
     sep_token = special_token_metadata["sep_token"]

-    # If the tokenizer is not adding special tokens, then we add them manually
+    # If the tokeniser is not adding special tokens, then we add them manually
     if not has_cls_token and not has_sep_token:
         examples["question"] = [
             f"{cls_token}{q}{sep_token}" for q in examples["question"]
@@ -258,18 +265,18 @@ def prepare_train_examples(
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
-    max_question_tokens = max(len(tokenizer(q).input_ids) for q in examples["question"])
+    max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
-    stride = tokenizer.model_max_length // 4
-    max_length = tokenizer.model_max_length - stride
+    stride = tokeniser.model_max_length // 4
+    max_length = tokeniser.model_max_length - stride
     stride = min(stride, max_length - max_question_tokens - num_special_tokens)
-    max_length = tokenizer.model_max_length - stride
+    max_length = tokeniser.model_max_length - stride

     # Tokenize our examples with truncation and padding, but keep the overflows using a
     # stride. This results in one example possible giving several features when a
     # context is long, each of those features having a context that overlaps a bit the
     # context of the previous feature.
-    tokenized_examples = tokenizer(
+    tokenized_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
         truncation="only_second",
@@ -306,9 +313,9 @@ def prepare_train_examples(
         sequence_ids = tokenized_examples.sequence_ids(i)

         # Manually ensure that the special tokens are set to None in `sequence_ids`
-        for special_token in tokenizer.special_tokens_map.keys():
-            if hasattr(tokenizer, f"{special_token}_id"):
-                special_token_id = getattr(tokenizer, f"{special_token}_id")
+        for special_token in tokeniser.special_tokens_map.keys():
+            if hasattr(tokeniser, f"{special_token}_id"):
+                special_token_id = getattr(tokeniser, f"{special_token}_id")
                 if special_token_id is not None:
                     sequence_ids = [
                         None if token_id == special_token_id else seq_id
@@ -373,15 +380,15 @@ def prepare_train_examples(


 def prepare_test_examples(
-    examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare test examples.

     Args:
         examples:
             Dictionary of test examples.
-        tokenizer:
-            The tokenizer used to preprocess the examples.
+        tokeniser:
+            The tokeniser used to preprocess the examples.

     Returns:
         The prepared test examples.
@@ -391,14 +398,14 @@ def prepare_test_examples(
     # take a lots of space). So we remove that left whitespace
     examples["question"] = [q.lstrip() for q in examples["question"]]

-    # Extract special token metadata from the tokenizer
-    special_token_metadata = get_special_token_metadata(tokenizer=tokenizer)
+    # Extract special token metadata from the tokeniser
+    special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
     has_cls_token = special_token_metadata["has_cls_token"]
     has_sep_token = special_token_metadata["has_sep_token"]
     cls_token = special_token_metadata["cls_token"]
     sep_token = special_token_metadata["sep_token"]

-    # If the tokenizer is not adding special tokens, then we add them manually
+    # If the tokeniser is not adding special tokens, then we add them manually
     if not has_cls_token and not has_sep_token:
         examples["question"] = [
             f"{cls_token}{q}{sep_token}" for q in examples["question"]
@@ -409,18 +416,18 @@ def prepare_test_examples(
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
-    max_question_tokens = max(len(tokenizer(q).input_ids) for q in examples["question"])
+    max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
-    stride = tokenizer.model_max_length // 4
-    max_length = tokenizer.model_max_length - stride
+    stride = tokeniser.model_max_length // 4
+    max_length = tokeniser.model_max_length - stride
     stride = min(stride, max_length - max_question_tokens - num_special_tokens)
-    max_length = tokenizer.model_max_length - stride
+    max_length = tokeniser.model_max_length - stride

     # Tokenize our examples with truncation and maybe padding, but keep the overflows
     # using a stride. This results in one example possible giving several features when
     # a context is long, each of those features having a context that overlaps a bit
     # the context of the previous feature.
-    tokenized_examples = tokenizer(
+    tokenized_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
         truncation="only_second",
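The stride/max-length computation in prepare_train_examples and prepare_test_examples is unchanged apart from the tokeniser rename; a worked example with hypothetical numbers makes the two-step adjustment easier to follow:

    # Hypothetical values, mirroring the arithmetic above
    model_max_length = 512
    max_question_tokens = 300
    num_special_tokens = 2

    stride = model_max_length // 4          # 128
    max_length = model_max_length - stride  # 384
    stride = min(stride, max_length - max_question_tokens - num_special_tokens)  # min(128, 82) = 82
    max_length = model_max_length - stride  # 430
    print(stride, max_length)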

euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -7,6 +7,7 @@ import typing as t
 import Levenshtein
 import numpy as np

+from ..enums import TaskGroup
 from ..exceptions import InvalidBenchmark
 from ..utils import log_once, raise_if_model_output_contains_nan_values

@@ -14,7 +15,7 @@ if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
     from transformers.trainer_utils import EvalPrediction

-    from ..data_models import DatasetConfig, GenerativeModelOutput
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions


@@ -24,6 +25,7 @@ logger = logging.getLogger("euroeval")
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     dataset: "Dataset",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
@@ -34,6 +36,8 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
+        benchmark_config:
+            The configuration of the benchmark.
         dataset:
             The dataset used for evaluation. This is only used in case any additional
             metadata is used to compute the metrics.
@@ -79,7 +83,11 @@ def compute_metrics(
     results: dict[str, float] = dict()
     for metric in dataset_config.task.metrics:
         score: float | None = metric(
-            predictions=predictions, references=label_ids, dataset=dataset
+            predictions=predictions,
+            references=label_ids,
+            dataset=dataset,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
         )

         # The metric returns None if we are running on multi-GPU and the current
@@ -113,6 +121,12 @@ def extract_labels_from_generation(

     Returns:
         The predicted labels.
+
+    Raises:
+        InvalidBenchmark:
+            If the task requires log probabilities, but the model did not output them,
+            or if the model outputted log probabilities but the first label token
+            mapping is not provided.
     """
     if model_output.scores is not None:
         if first_label_token_mapping is False:
@@ -127,25 +141,74 @@
         )
         if labels is not None:
             return labels
+        elif dataset_config.task.requires_logprobs:
+            raise InvalidBenchmark(
+                "This task requires the model to output logprobs, and this model "
+                "does not seem to be able to do that. Skipping the evaluation."
+            )

+    # Get the candidate labels, which are the labels that the model can predict
     candidate_labels = [
         dataset_config.prompt_label_mapping[lbl]
         for lbl in dataset_config.id2label.values()
     ]
+
     new_predicted_labels: list[str] = list()
-    for predicted_label in model_output.sequences:
+    for idx, predicted_label in enumerate(model_output.sequences):
+        # Special case if we are doing multiple choice classification: we in this case
+        # dynamically change the candidate labels to the labels mentioned in the prompt
+        if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+            prompt = input_batch["text"][idx]
+            sample_candidate_labels = [
+                candidate_label
+                for candidate_label in candidate_labels
+                if re.search(
+                    pattern=rf"\b{candidate_label}. ",
+                    string=prompt,
+                    flags=re.IGNORECASE,
+                )
+                is not None
+            ]
+        else:
+            sample_candidate_labels = candidate_labels
+
         # If the prediction includes a boxed answer, use that instead of the full
         # generation
         if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
             predicted_label = m.group(1)

-        # Pick the label with the smallest word edit distance to the predicted label
+        # We set the word edit distance weights such that we heavily penalise insertions
+        # and substitutions, so that we don't just insert the correct label, but that we
+        # want the model to have included the correct label in its output.
+        insertion_weight = 1000
+        deletion_weight = 1
+        substitution_weight = 1000
+
+        # Compute the word edit distances between the predicted label and all candidate
+        # labels
         edit_distances = [
-            Levenshtein.distance(s1=predicted_label.lower(), s2=candidate_label.lower())
-            for candidate_label in candidate_labels
+            Levenshtein.distance(
+                s1=predicted_label.lower(),
+                s2=candidate_label.lower(),
+                weights=(insertion_weight, deletion_weight, substitution_weight),
+            )
+            for candidate_label in sample_candidate_labels
         ]
-        predicted_label = candidate_labels[np.argmin(edit_distances).item()]
-        new_predicted_labels.append(predicted_label)
+
+        # If no candidate labels were found, we assume that something is wrong with the
+        # model output, and we raise an error
+        if min(edit_distances) > 100:
+            raise InvalidBenchmark(
+                "No candidate labels found for the predicted label "
+                f"{predicted_label!r}, out of the candidate labels "
+                f"{sample_candidate_labels}. This likely means that the model output "
+                "is completely off, and we cannot extract any labels from it. Please "
+                "check the model output and the candidate labels."
+            )
+
+        # Pick the label with the smallest word edit distance to the predicted label
+        best_candidate_label = sample_candidate_labels[np.argmin(edit_distances).item()]
+        new_predicted_labels.append(best_candidate_label)

     return new_predicted_labels

@@ -187,11 +250,7 @@ def get_closest_logprobs_labels(
     for sample in generation_logprobs:
         for logprob_list in sample:
             generated_labels = [
-                re.sub(
-                    pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
-                    repl="",
-                    string=label.lower(),
-                )
+                re.sub(pattern=r"^[^a-zæøåüöä0-9]+$", repl="", string=label.lower())
                 for label, _ in logprob_list
             ]
             generated_labels = [label for label in generated_labels if label != ""]
@@ -227,6 +286,18 @@ def get_closest_logprobs_labels(
                    if candidate_label.startswith(generated_label)
                }

+                # If the generated label is a numeral (e.g., "1", "2", "3") and there is
+                # a matching candidate label, we only keep the full match
+                if re.match(r"^\d+$", generated_label) and any(
+                    candidate_label == generated_label
+                    for candidate_label in candidate_output_labels
+                ):
+                    candidate_output_labels = {
+                        candidate_label
+                        for candidate_label in candidate_output_labels
+                        if candidate_label == generated_label
+                    }
+
                # If we can uniquely determine the output label, we break the loop.
                if len(candidate_output_labels) == 1:
                    output_label = candidate_output_labels.pop()
@@ -263,10 +334,12 @@ def get_closest_logprobs_labels(
                if candidate_output_labels_starting_with_generated_label:
                    log_once(
                        f"No candidate label found for the generated label "
-                        f"{generated_label!r}. This means that using logprobs to "
-                        "extract the labels is not reliable, and we will instead "
-                        "fall back to extracting the labels using word edit "
-                        "distance.",
+                        f"{generated_label!r}, but there are candidate labels "
+                        f"starting with it: "
+                        f"{candidate_output_labels_starting_with_generated_label}. "
+                        "This means that the first label token mapping is not "
+                        "reliable, and we will instead fall back to extracting "
+                        "the labels using word edit distance.",
                        level=logging.DEBUG,
                    )
                    return None
@@ -291,16 +364,16 @@ def get_closest_logprobs_labels(
        if len(sample) == 0:
            log_once(
                "The model outputted an empty string, so no candidate labels could "
-                f"be determined. Using {candidate_labels[0]!r} as the output "
-                "label.",
-                level=logging.DEBUG,
+                f"be determined. Using the first label, {candidate_labels[0]!r}, "
+                "as the output label.",
+                level=logging.INFO,
            )
        else:
            log_once(
                "Could not find a candidate label for any of the generated "
-                f"labels in the sample {sample}. Using {candidate_labels[0]!r} "
-                "as the output label.",
-                level=logging.DEBUG,
+                f"labels in the sample {sample}. Using the first label, "
+                f"{candidate_labels[0]!r}, as the output label.",
+                level=logging.INFO,
            )
        output_labels.append(candidate_labels[0])

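The weighted edit distance above is the most significant change in this file: with insertions and substitutions costing 1000 and deletions costing 1, a candidate label only scores low when it actually appears, in order, inside the model output. A hedged sketch using the same Levenshtein package the module imports (hypothetical strings):

    import Levenshtein

    weights = (1000, 1, 1000)  # (insertion, deletion, substitution)
    prediction = "the answer is: positive"

    # "positive" occurs inside the prediction, so only cheap deletions are needed
    print(Levenshtein.distance(prediction, "positive", weights=weights))  # 15

    # "negative" is not contained in the prediction, so at least one expensive
    # insertion/substitution is required and the distance jumps well above 1000
    print(Levenshtein.distance(prediction, "negative", weights=weights))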

euroeval/task_group_utils/text_to_text.py CHANGED
@@ -75,7 +75,11 @@ def compute_metrics(
        while True:
            try:
                score: float | None = metric(
-                    predictions=predictions, references=labels, dataset=dataset
+                    predictions=predictions,
+                    references=labels,
+                    dataset=dataset,
+                    dataset_config=dataset_config,
+                    benchmark_config=benchmark_config,
                )
                break
            except Exception as e:
@@ -85,7 +89,7 @@ def compute_metrics(
                    "MPS backend out of memory",
                ]
                if not any(error in str(e) for error in oom_error):
-                    raise InvalidBenchmark(str(e))
+                    raise InvalidBenchmark(str(e)) from e

                if (
                    isinstance(metric, HuggingFaceMetric)
@@ -98,7 +102,7 @@
                        "the CPU."
                    )
                else:
-                    raise InvalidBenchmark(str(e))
+                    raise InvalidBenchmark(str(e)) from e
            finally:
                for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
                    if hasattr(metric, attribute):
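A pattern that recurs throughout this release is re-raising wrapped errors with `raise ... from e`. A minimal sketch of what the chaining buys; the exception class here is a stand-in, not the one from euroeval.exceptions:

    class InvalidBenchmark(Exception):
        """Stand-in for euroeval.exceptions.InvalidBenchmark."""

    try:
        try:
            raise RuntimeError("CUDA out of memory")
        except RuntimeError as e:
            # Chaining keeps the original error as __cause__, so the underlying
            # runtime/OOM error still shows up in the traceback
            raise InvalidBenchmark(str(e)) from e
    except InvalidBenchmark as err:
        assert isinstance(err.__cause__, RuntimeError)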