EuroEval: euroeval-16.3.0-py3-none-any.whl → euroeval-16.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (64)
  1. euroeval/__init__.py +3 -2
  2. euroeval/benchmark_config_factory.py +0 -4
  3. euroeval/benchmark_modules/base.py +3 -16
  4. euroeval/benchmark_modules/fresh.py +2 -1
  5. euroeval/benchmark_modules/hf.py +99 -62
  6. euroeval/benchmark_modules/litellm.py +101 -41
  7. euroeval/benchmark_modules/vllm.py +91 -83
  8. euroeval/benchmarker.py +84 -78
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/constants.py +6 -0
  12. euroeval/data_loading.py +14 -11
  13. euroeval/data_models.py +12 -4
  14. euroeval/dataset_configs/__init__.py +2 -0
  15. euroeval/dataset_configs/czech.py +79 -0
  16. euroeval/dataset_configs/danish.py +10 -11
  17. euroeval/dataset_configs/dutch.py +0 -1
  18. euroeval/dataset_configs/english.py +0 -1
  19. euroeval/dataset_configs/estonian.py +11 -1
  20. euroeval/dataset_configs/finnish.py +0 -1
  21. euroeval/dataset_configs/french.py +0 -1
  22. euroeval/dataset_configs/german.py +0 -1
  23. euroeval/dataset_configs/italian.py +0 -1
  24. euroeval/dataset_configs/latvian.py +0 -1
  25. euroeval/dataset_configs/lithuanian.py +9 -3
  26. euroeval/dataset_configs/norwegian.py +0 -1
  27. euroeval/dataset_configs/polish.py +0 -1
  28. euroeval/dataset_configs/portuguese.py +0 -1
  29. euroeval/dataset_configs/slovak.py +60 -0
  30. euroeval/dataset_configs/spanish.py +0 -1
  31. euroeval/dataset_configs/swedish.py +10 -12
  32. euroeval/finetuning.py +21 -15
  33. euroeval/generation.py +10 -10
  34. euroeval/generation_utils.py +2 -3
  35. euroeval/logging_utils.py +250 -0
  36. euroeval/metrics/base.py +0 -3
  37. euroeval/metrics/huggingface.py +9 -5
  38. euroeval/metrics/llm_as_a_judge.py +5 -3
  39. euroeval/metrics/pipeline.py +17 -9
  40. euroeval/metrics/speed.py +0 -3
  41. euroeval/model_cache.py +11 -14
  42. euroeval/model_config.py +4 -5
  43. euroeval/model_loading.py +3 -0
  44. euroeval/prompt_templates/linguistic_acceptability.py +21 -3
  45. euroeval/prompt_templates/multiple_choice.py +25 -1
  46. euroeval/prompt_templates/named_entity_recognition.py +51 -11
  47. euroeval/prompt_templates/reading_comprehension.py +31 -3
  48. euroeval/prompt_templates/sentiment_classification.py +23 -1
  49. euroeval/prompt_templates/summarization.py +26 -6
  50. euroeval/scores.py +7 -7
  51. euroeval/speed_benchmark.py +3 -5
  52. euroeval/task_group_utils/multiple_choice_classification.py +0 -3
  53. euroeval/task_group_utils/question_answering.py +0 -3
  54. euroeval/task_group_utils/sequence_classification.py +43 -31
  55. euroeval/task_group_utils/text_to_text.py +17 -8
  56. euroeval/task_group_utils/token_classification.py +10 -9
  57. euroeval/tokenisation_utils.py +14 -12
  58. euroeval/utils.py +29 -146
  59. {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/METADATA +4 -4
  60. euroeval-16.4.0.dist-info/RECORD +75 -0
  61. euroeval-16.3.0.dist-info/RECORD +0 -71
  62. {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
  63. {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
  64. {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/prompt_templates/named_entity_recognition.py CHANGED
@@ -4,6 +4,7 @@ import typing as t
 
  from ..data_models import PromptConfig
  from ..languages import (
+ CS,
  DA,
  DE,
  EN,
@@ -22,6 +23,7 @@ from ..languages import (
  NO,
  PL,
  PT,
+ SK,
  SV,
  )
 
@@ -30,6 +32,25 @@ if t.TYPE_CHECKING:
 
 
  NER_TEMPLATES: dict["Language", PromptConfig] = {
+ CS: PromptConfig(
+ default_prompt_label_mapping={
+ "b-per": "osoba",
+ "i-per": "osoba",
+ "b-loc": "místo",
+ "i-loc": "místo",
+ "b-org": "organizace",
+ "i-org": "organizace",
+ "b-misc": "různé",
+ "i-misc": "různé",
+ },
+ default_prompt_prefix="Následující jsou věty a JSON slovníky s pojmenovanými "
+ "entitami, které se v dané větě vyskytují.",
+ default_prompt_template="Věta: {text}\nPojmenované entity: {label}",
+ default_instruction_prompt="Věta: {text}\n\nIdentifikujte pojmenované entity "
+ "ve větě. Měli byste to vypsat jako JSON slovník s klíči {labels_str}. "
+ "Hodnoty by měly být seznamy pojmenovaných entit tohoto typu, přesně tak, "
+ "jak se objevují ve větě.",
+ ),
  DA: PromptConfig(
  default_prompt_label_mapping={
  "b-per": "person",
@@ -361,20 +382,39 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
  default_prompt_label_mapping={
  "b-per": "osoba",
  "i-per": "osoba",
- "b-loc": "lokalizacja",
- "i-loc": "lokalizacja",
+ "b-loc": "miejsce",
+ "i-loc": "miejsce",
  "b-org": "organizacja",
  "i-org": "organizacja",
- "b-misc": "różne",
- "i-misc": "różne",
+ "b-misc": "inne",
+ "i-misc": "inne",
+ },
+ default_prompt_prefix="Poniżej znajdują się zdania i słowniki JSON "
+ "z jednostkami nazewniczymi, które występują w danym zdaniu.",
+ default_prompt_template="Zdanie: {text}\nJednostki nazewnicze: {label}",
+ default_instruction_prompt="Zdanie: {text}\n\nZidentyfikuj jednostki "
+ "nazewnicze w zdaniu. Wypisz je jako słownik JSON z kluczami "
+ "{labels_str}. Wartości odpowiadające kluczom powinny być listami jednostek "
+ "nazewniczych danego typu, dokładnie tak, jak pojawiają się w zdaniu.",
+ ),
+ SK: PromptConfig(
+ default_prompt_label_mapping={
+ "b-per": "osoba",
+ "i-per": "osoba",
+ "b-loc": "miesto",
+ "i-loc": "miesto",
+ "b-org": "organizácia",
+ "i-org": "organizácia",
+ "b-misc": "rôzne",
+ "i-misc": "rôzne",
  },
- default_prompt_prefix="Poniżej znajdują się zdania i słowniki JSON z nazwanymi "
- "jednostkami występującymi w danym zdaniu.",
- default_prompt_template="Zdanie: {text}\nNazwane jednostki: {label}",
- default_instruction_prompt="Zdanie: {text}\n\nZidentyfikuj nazwane jednostki "
- "w zdaniu. Powinieneś wypisać to jako słownik JSON z kluczami "
- "{labels_str}. Wartości powinny być listami nazwanych jednostek "
- "tego typu, dokładnie tak jak pojawiają się w zdaniu.",
+ default_prompt_prefix="Nasledujúce vety a JSON-objekty s pomenovanými "
+ "entitami, ktoré sa nachádzajú v danej vete.",
+ default_prompt_template="Veta: {text}\nPomenované entity: {label}",
+ default_instruction_prompt="Veta: {text}\n\nIdentifikujte pomenované "
+ "entity vo vete. Výstup by mal byť vo forme JSON-objektu s kľúčmi "
+ "{labels_str}. Hodnoty by mali byť zoznamy pomenovaných entít danej "
+ "kategórie, presne tak, ako sa vyskytujú vo vete.",
  ),
  SV: PromptConfig(
  default_prompt_label_mapping={
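For reference, a rough sketch of how the new Czech (CS) entry's prefix and template could fit together into a few-shot NER prompt. The joining logic, the example sentence and the JSON formatting are assumptions for illustration only, not EuroEval's actual prompt-assembly code.

# Illustrative sketch only: assembles a Czech NER prompt from the new CS
# PromptConfig fields. Joining logic and example data are assumptions;
# EuroEval's real prompt assembly lives elsewhere in the package.
import json

prefix = (
    "Následující jsou věty a JSON slovníky s pojmenovanými "
    "entitami, které se v dané větě vyskytují."
)
template = "Věta: {text}\nPojmenované entity: {label}"

example_label = json.dumps(
    {"osoba": ["Karel Čapek"], "místo": ["Praha"]}, ensure_ascii=False
)
few_shot_example = template.format(
    text="Karel Čapek žil v Praze.", label=example_label
)
print(prefix + "\n\n" + few_shot_example)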
euroeval/prompt_templates/reading_comprehension.py CHANGED
@@ -4,6 +4,7 @@ import typing as t
 
  from ..data_models import PromptConfig
  from ..languages import (
+ CS,
  DA,
  DE,
  EN,
@@ -22,6 +23,7 @@ from ..languages import (
  NO,
  PL,
  PT,
+ SK,
  SV,
  )
 
@@ -29,6 +31,19 @@ if t.TYPE_CHECKING:
  from ..data_models import Language
 
  RC_TEMPLATES: dict["Language", PromptConfig] = {
+ CS: PromptConfig(
+ default_prompt_prefix="Následující texty obsahují otázky a odpovědi.",
+ default_prompt_template=(
+ "Text: {text}\nOtázka: {question}\nOdpověď maximálně 3 slovy: {label}"
+ ),
+ default_instruction_prompt=(
+ "Text: {text}\n\n"
+ "Odpovězte na následující otázku k výše uvedenému textu "
+ "maximálně 3 slovy.\n\n"
+ "Otázka: {question}"
+ ),
+ default_prompt_label_mapping=dict(),
+ ),
  DA: PromptConfig(
  default_prompt_prefix="Følgende er tekster med tilhørende spørgsmål og svar.",
  default_prompt_template="Tekst: {text}\nSpørgsmål: {question}\nSvar med maks. "
@@ -172,10 +187,11 @@ RC_TEMPLATES: dict["Language", PromptConfig] = {
  default_prompt_prefix=(
  "Poniżej znajdują się teksty z towarzyszącymi pytaniami i odpowiedziami."
  ),
- default_prompt_template="Tekst: {text}\nPytanie: {question}\nOdpowiedź w "
- "maksymalnie 3 słowach: {label}",
+ default_prompt_template="Tekst: {text}\nPytanie: {question}\nOdpowiedź z "
+ "użyciem maksymalnie 3 słów: {label}",
  default_instruction_prompt="Tekst: {text}\n\nOdpowiedz na następujące pytanie "
- "dotyczące powyższego tekstu w maksymalnie 3 słowach.\n\nPytanie: {question}",
+ "dotyczące powyższego tekstu, używając maksymalnie 3 słów.\n\nPytanie: "
+ "{question}",
  default_prompt_label_mapping=dict(),
  ),
  PT: PromptConfig(
@@ -187,6 +203,18 @@ RC_TEMPLATES: dict["Language", PromptConfig] = {
  "sobre o texto acima num máximo de 3 palavras.\n\nPergunta: {question}",
  default_prompt_label_mapping=dict(),
  ),
+ SK: PromptConfig(
+ default_prompt_prefix=("Nasledujú texty s pridruženými otázkami a odpoveďami."),
+ default_prompt_template=(
+ "Text: {text}\nOtázka: {question}\nOdpoveď na maximálne 3 slová: {label}"
+ ),
+ default_instruction_prompt=(
+ "Text: {text}\n\n"
+ "Odpovedzte na nasledujúcu otázku týkajúcu sa textu uvedeného vyššie "
+ "maximálne 3 slovami.\n\nOtázka: {question}"
+ ),
+ default_prompt_label_mapping=dict(),
+ ),
  SV: PromptConfig(
  default_prompt_prefix="Nedan följer texter med tillhörande frågor och svar.",
  default_prompt_template="Text: {text}\nFråga: {question}\nSvar på max 3 ord: "
euroeval/prompt_templates/sentiment_classification.py CHANGED
@@ -4,6 +4,7 @@ import typing as t
 
  from ..data_models import PromptConfig
  from ..languages import (
+ CS,
  DA,
  DE,
  EN,
@@ -22,6 +23,7 @@ from ..languages import (
  NO,
  PL,
  PT,
+ SK,
  SV,
  )
 
@@ -39,6 +41,16 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
  default_instruction_prompt="Dokument: {text}\n\nKlassificer sentimentet i "
  "dokumentet. Svar kun med {labels_str}, og intet andet.",
  ),
+ CS: PromptConfig(
+ default_prompt_label_mapping=dict(
+ positive="pozitivní", neutral="neutrální", negative="negativní"
+ ),
+ default_prompt_prefix="Následují dokumenty a jejich sentiment, který může být "
+ "{labels_str}.",
+ default_prompt_template="Dokument: {text}\nSentiment: {label}",
+ default_instruction_prompt="Dokument: {text}\n\nKlasifikujte sentiment v "
+ "dokumentu. Odpovězte pouze s {labels_str}, a nic jiného.",
+ ),
  DE: PromptConfig(
  default_prompt_label_mapping=dict(
  positive="positiv", neutral="neutral", negative="negativ"
@@ -91,7 +103,7 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
  default_prompt_template="Dokument: {text}\nSentyment: {label}",
  default_instruction_prompt=(
  "Dokument: {text}\n\nKlasyfikuj sentyment w dokumencie. "
- "Odpowiedz z {labels_str}, i nic więcej."
+ "Odpowiedz jednym słowem: {labels_str}."
  ),
  ),
  PT: PromptConfig(
@@ -214,6 +226,16 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
  default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
  "teksten. Svar med {labels_str}, og ikke noe annet.",
  ),
+ SK: PromptConfig(
+ default_prompt_label_mapping=dict(
+ positive="pozitívne", neutral="neutrálne", negative="negatívne"
+ ),
+ default_prompt_prefix="Nižšie sú dokumenty a ich sentiment, ktorý môže byť "
+ "{labels_str}.",
+ default_prompt_template="Dokument: {text}\nSentiment: {label}",
+ default_instruction_prompt="Dokument: {text}\n\nKlasifikujte pocit v "
+ "dokumente. Odpovedzte so {labels_str}, a nič iné.",
+ ),
  SV: PromptConfig(
  default_prompt_label_mapping=dict(
  positive="positiv", neutral="neutral", negative="negativ"
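A hypothetical rendering of the new Czech sentiment instruction prompt, showing how the {labels_str} placeholder relates to the values of default_prompt_label_mapping; the exact way EuroEval formats and joins the labels may differ.

# Hypothetical rendering of the new Czech sentiment instruction prompt.
# How labels_str is actually built inside EuroEval may differ.
label_mapping = dict(positive="pozitivní", neutral="neutrální", negative="negativní")
instruction = (
    "Dokument: {text}\n\nKlasifikujte sentiment v "
    "dokumentu. Odpovězte pouze s {labels_str}, a nic jiného."
)
labels_str = ", ".join(label_mapping.values())  # assumed joining for illustration
print(instruction.format(text="Ten film byl skvělý.", labels_str=labels_str))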
euroeval/prompt_templates/summarization.py CHANGED
@@ -4,6 +4,7 @@ import typing as t
 
  from ..data_models import PromptConfig
  from ..languages import (
+ CS,
  DA,
  DE,
  EN,
@@ -13,6 +14,7 @@ from ..languages import (
  FR,
  IS,
  IT,
+ LT,
  LV,
  NB,
  NL,
@@ -28,6 +30,14 @@ if t.TYPE_CHECKING:
 
  # TODO: Missing Faroese
  SUMM_TEMPLATES: dict["Language", PromptConfig] = {
+ CS: PromptConfig(
+ default_prompt_prefix=("Následující jsou dokumenty s přiloženými souhrny."),
+ default_prompt_template=("Dokument: {text}\nSouhrn: {target_text}"),
+ default_instruction_prompt=(
+ "Dokument: {text}\n\nNapište souhrn výše uvedeného dokumentu."
+ ),
+ default_prompt_label_mapping=dict(),
+ ),
  DA: PromptConfig(
  default_prompt_prefix="Følgende er dokumenter med tilhørende resuméer.",
  default_prompt_template="Dokument: {text}\nResumé: {target_text}",
@@ -96,11 +106,14 @@ SUMM_TEMPLATES: dict["Language", PromptConfig] = {
  ),
  default_prompt_label_mapping=dict(),
  ),
- IS: PromptConfig(
- default_prompt_prefix="Eftirfarandi eru skjöl með meðfylgjandi samantektum.",
- default_prompt_template="Skjal: {text}\nSamantekt: {target_text}",
- default_instruction_prompt="Skjal: {text}\n\nSkrifaðu samantekt á ofangreindu "
- "skjali.",
+ LT: PromptConfig(
+ default_prompt_prefix=(
+ "Žemiau pateikiami dokumentai su pridėtomis santraukomis."
+ ),
+ default_prompt_template=("Dokumentas: {text}\nSantrauka: {target_text}"),
+ default_instruction_prompt=(
+ "Dokumentas: {text}\n\nParašykite aukščiau pateikto dokumento santrauką."
+ ),
  default_prompt_label_mapping=dict(),
  ),
  IT: PromptConfig(
@@ -111,6 +124,13 @@ SUMM_TEMPLATES: dict["Language", PromptConfig] = {
  "documento di cui sopra.",
  default_prompt_label_mapping=dict(),
  ),
+ IS: PromptConfig(
+ default_prompt_prefix="Eftirfarandi eru skjöl með meðfylgjandi samantektum.",
+ default_prompt_template="Skjal: {text}\nSamantekt: {target_text}",
+ default_instruction_prompt="Skjal: {text}\n\nSkrifaðu samantekt á ofangreindu "
+ "skjali.",
+ default_prompt_label_mapping=dict(),
+ ),
  NB: PromptConfig(
  default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
  default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
@@ -142,7 +162,7 @@ SUMM_TEMPLATES: dict["Language", PromptConfig] = {
  ),
  PL: PromptConfig(
  default_prompt_prefix="Poniżej znajdują się artykuły z towarzyszącymi "
- "streszczeniami.",
+ "im streszczeniami.",
  default_prompt_template="Artykuł: {text}\nStreszczenie: {target_text}",
  default_instruction_prompt="Artykuł: {text}\n\nNapisz streszczenie "
  "powyższego artykułu.",
euroeval/scores.py CHANGED
@@ -6,12 +6,12 @@ import warnings
 
  import numpy as np
 
+ from .logging_utils import log
+
  if t.TYPE_CHECKING:
  from .metrics import Metric
  from .types import ScoreDict
 
- logger = logging.getLogger("euroeval")
-
 
  def log_scores(
  dataset_name: str,
@@ -48,9 +48,8 @@
  if model_param is not None:
  model_id += f"#{model_param}"
 
- logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")
-
  total_dict: dict[str, float] = dict()
+ all_log_strs: list[str] = [f"Finished benchmarking {model_id} on {dataset_name}."]
  for metric in metrics:
  test_score, test_se = aggregate_scores(scores=scores, metric=metric)
  test_score, test_score_str = metric.postprocessing_fn(test_score)
@@ -58,11 +57,12 @@
  total_dict[f"test_{metric.name}"] = test_score
  total_dict[f"test_{metric.name}_se"] = test_se
  log_str = (
- f"{metric.pretty_name}: {test_score_str} ± {test_se_str}"
+ f"- {metric.pretty_name}: {test_score_str} ± {test_se_str}"
  if not np.isnan(test_se)
- else f"{metric.pretty_name}: {test_score_str}"
+ else f"- {metric.pretty_name}: {test_score_str}"
  )
- logger.info(log_str)
+ all_log_strs.append(log_str)
+ log("\n".join(all_log_strs), level=logging.INFO)
 
  return dict(raw=scores, total=total_dict)
 
euroeval/speed_benchmark.py CHANGED
@@ -4,19 +4,17 @@ import logging
  import typing as t
 
  import pyinfer
- from tqdm.auto import tqdm
  from transformers.models.auto.tokenization_auto import AutoTokenizer
 
  from .benchmark_modules import HuggingFaceEncoderModel, LiteLLMModel, VLLMModel
  from .exceptions import InvalidBenchmark
+ from .logging_utils import get_pbar, log
  from .utils import clear_memory
 
  if t.TYPE_CHECKING:
  from .benchmark_modules import BenchmarkModule
  from .data_models import BenchmarkConfig
 
- logger = logging.getLogger("euroeval")
-
 
 
  def benchmark_speed(
@@ -33,7 +31,7 @@
  Dictionary of scores.
  """
  scores: list[dict[str, float]] = list()
- for idx in tqdm(
+ for idx in get_pbar(
  iterable=range(benchmark_config.num_iterations),
  desc="Benchmarking",
  disable=not benchmark_config.progress_bar,
@@ -41,7 +39,7 @@
  itr_scores = benchmark_speed_single_iteration(model=model, itr_idx=idx)
  clear_memory()
  scores.append(itr_scores)
- logger.debug(f"Scores for iteration {idx}: {itr_scores}")
+ log(f"Scores for iteration {idx}: {itr_scores}", level=logging.DEBUG)
  return scores
 
 
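get_pbar comes from the new euroeval/logging_utils.py; judging only from the call site above, it is a thin wrapper around tqdm. A plausible sketch under that assumption, not the real implementation:

# Plausible sketch of get_pbar, inferred only from the call site above
# (iterable=, desc=, disable=). The real function in euroeval/logging_utils.py
# may configure the progress bar differently.
import typing as t

from tqdm.auto import tqdm


def get_pbar(iterable: t.Iterable, desc: str, disable: bool = False) -> tqdm:
    """Return a tqdm progress bar with EuroEval-style default settings."""
    return tqdm(iterable=iterable, desc=desc, disable=disable)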
euroeval/task_group_utils/multiple_choice_classification.py CHANGED
@@ -1,7 +1,6 @@
  """Utility functions related to the multiple-choice classification task group."""
 
  import hashlib
- import logging
  import re
  import typing as t
  from collections import defaultdict
@@ -18,8 +17,6 @@ if t.TYPE_CHECKING:
 
  from ..types import Labels, Predictions
 
- logger = logging.getLogger("euroeval")
-
 
  class MultipleChoiceClassificationTrainer(Trainer):
  """Trainer subclass for multiple-choice classification tasks."""
euroeval/task_group_utils/question_answering.py CHANGED
@@ -1,7 +1,6 @@
  """Utility functions related to the question-answering task group."""
 
  import collections.abc as c
- import logging
  import typing as t
  from collections import defaultdict
 
@@ -26,8 +25,6 @@ if t.TYPE_CHECKING:
  from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
  from ..types import Labels, Predictions
 
- logger = logging.getLogger("euroeval")
-
 
  class QuestionAnsweringTrainer(Trainer):
  """Trainer subclass for question answering tasks."""
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -19,13 +19,15 @@
  from datasets.arrow_dataset import Dataset
  from transformers.trainer_utils import EvalPrediction
 
- from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
+ from ..data_models import (
+ BenchmarkConfig,
+ DatasetConfig,
+ GenerativeModelOutput,
+ ModelConfig,
+ )
  from ..types import Labels, Predictions
 
 
- logger = logging.getLogger("euroeval")
-
-
  def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
@@ -106,6 +108,7 @@
  input_batch: dict[str, list],
  model_output: "GenerativeModelOutput",
  dataset_config: "DatasetConfig",
+ model_config: "ModelConfig",
  first_label_token_mapping: dict[str, str] | bool,
  ) -> list[str]:
  """Extract the predicted labels from the generated output.
@@ -118,6 +121,8 @@
  The raw generated output of the model.
  dataset_config:
  The configuration of the dataset.
+ model_config:
+ The configuration of the model.
  first_label_token_mapping:
  A mapping from labels to the first token in each label, or alternatively a
  Boolean value indicating whether the model should output scores (if the
@@ -167,6 +172,7 @@
  )
 
  new_predicted_labels: list[str] = list()
+ num_predictions_being_very_off = 0
  for idx, predicted_label in enumerate(model_output.sequences):
  # If the prediction includes a boxed answer, use that instead of the full
  # generation
@@ -199,34 +205,40 @@
  # word edit distance to the predicted label (if invalid model outputs are
  # allowed), or we raise an error
  if min(edit_distances) >= 1000:
- if dataset_config.allow_invalid_model_outputs:
- logger.warning(
- "No candidate labels found for the predicted label "
- f"{predicted_label!r}, out of the candidate labels "
- f"{sample_candidate_labels[idx]}. This likely means that the model "
- "output is completely off, but since invalid model outputs are "
- "allowed for this task, we will use the closest candidate label "
- f"({best_candidate_label})) as the output label. If you see this "
- "warning very often, please report this issue to the EuroEval "
- "team at github.com/EuroEval/EuroEval/issues."
- )
- logger.debug(
- "The candidate labels were extracted from the prompt: "
- f"{input_batch['text'][idx]!r}."
- )
- else:
- raise InvalidBenchmark(
- "No candidate labels found for the predicted label "
- f"{predicted_label!r}, out of the candidate labels "
- f"{sample_candidate_labels[idx]}. This likely means that the model "
- "output is completely off, and we cannot extract any labels from "
- "it. Please check the model output and the candidate labels. The "
- "candidate labels were extracted from the prompt: "
- f"{input_batch['text'][idx]!r}."
- )
+ num_predictions_being_very_off += 1
 
  new_predicted_labels.append(best_candidate_label)
 
+ if num_predictions_being_very_off > 0:
+ if dataset_config.allow_invalid_model_outputs:
+ log_msg = (
+ "No candidate labels found for the predicted label in "
+ f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+ f"of the samples with the model {model_config.model_id!r}. This "
+ "likely means that the model were completely off in these cases, "
+ "but since invalid model outputs are allowed for this task, we used "
+ "the closest candidate labels as the output labels."
+ )
+ level = logging.DEBUG
+ if num_predictions_being_very_off / len(model_output.sequences) > 0.5:
+ log_msg += (
+ " Since this happened for most of the model's predictions, please "
+ "report this issue to the EuroEval team at "
+ "github.com/EuroEval/EuroEval/issues."
+ )
+ level = logging.WARNING
+ log_once(log_msg, level=level)
+ else:
+ raise InvalidBenchmark(
+ "No candidate labels found for the predicted label in "
+ f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+ "of the samples. This likely means that the model were completely "
+ "off in these cases. Since this task does not allow invalid model "
+ "outputs, we have to abort the evaluation. Please re-run the "
+ "evaluation with the `--debug` flag (or `debug=True` if you're using "
+ "the `Benchmarker` API) to see the precise model outputs."
+ )
+
  return new_predicted_labels
 
 
@@ -355,7 +367,7 @@
  "be determined. This means that using logprobs to extract the "
  "labels is not reliable, and we will instead fall back to "
  "extracting the labels using word edit distance.",
- level=logging.INFO,
+ level=logging.DEBUG,
  )
  else:
  log_once(
@@ -363,7 +375,7 @@
  "means that using logprobs to extract the labels is not reliable, "
  "and we will instead fall back to extracting the labels using "
  "word edit distance.",
- level=logging.INFO,
+ level=logging.DEBUG,
  )
  return None
 
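The per-sample warnings are collapsed into one aggregated message sent through log_once, which, as the name suggests, emits a given message only once. A generic sketch of such a helper; EuroEval's own log_once lives in the new euroeval/logging_utils.py and its implementation may differ.

# Generic sketch of a "log once" helper; EuroEval's own version may differ.
import logging

logger = logging.getLogger("euroeval")
_seen_messages: set[str] = set()


def log_once(message: str, level: int = logging.INFO) -> None:
    """Log the message at the given level, but only the first time it is seen."""
    if message not in _seen_messages:
        _seen_messages.add(message)
        logger.log(level, message)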
euroeval/task_group_utils/text_to_text.py CHANGED
@@ -7,6 +7,7 @@ import numpy as np
 
  from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
  from ..exceptions import InvalidBenchmark
+ from ..logging_utils import log
  from ..metrics import HuggingFaceMetric
  from ..utils import raise_if_model_output_contains_nan_values
 
@@ -18,9 +19,6 @@
  from ..types import Labels, Predictions
 
 
- logger = logging.getLogger("euroeval")
-
-
  def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
@@ -44,6 +42,10 @@ def compute_metrics(
  Returns:
  A dictionary with the names of the metrics as keys and the metric values as
  values.
+
+ Raises:
+ InvalidBenchmark:
+ If the metric computation fails.
  """
  model_outputs, labels = model_outputs_and_labels
 
@@ -72,7 +74,7 @@
  ):
  metric.compute_kwargs["device"] = benchmark_config.device.type
 
- while True:
+ for _ in range(num_attempts := 5):
  try:
  score: float | None = metric(
  predictions=predictions,
@@ -96,21 +98,28 @@
  and metric.compute_kwargs.get("device", "cpu") != "cpu"
  ):
  metric.compute_kwargs["device"] = "cpu"
- logger.debug(
+ log(
  "Out of memory error occurred during the computation of "
  f"the metric {metric.pretty_name}. Moving the computation to "
- "the CPU."
+ "the CPU.",
+ level=logging.DEBUG,
  )
  else:
  raise InvalidBenchmark(str(e)) from e
  finally:
  for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
  if hasattr(metric, attribute):
- logger.debug(
+ log(
  f"Deleting the {attribute!r} attribute of the metric "
- f"{metric.pretty_name} to free up memory."
+ f"{metric.pretty_name} to free up memory.",
+ level=logging.DEBUG,
  )
  delattr(metric, attribute)
+ else:
+ raise InvalidBenchmark(
+ f"Could not compute the metric {metric.pretty_name} after "
+ f"{num_attempts} attempts due to out of memory errors."
+ )
 
  # The metric returns None if we are running on multi-GPU and the current
  # process is not the main process
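Replacing while True with a bounded loop relies on Python's for/else idiom: the else block runs only when the loop finishes without a break, i.e. when every attempt failed. A standalone illustration of the pattern, with a made-up flaky_metric stand-in:

# Standalone illustration of the bounded-retry for/else idiom adopted above:
# the else clause runs only if no break occurred, i.e. every attempt failed.
import random


def flaky_metric() -> float:
    """Stand-in for a metric computation that sometimes runs out of memory."""
    if random.random() < 0.5:
        raise MemoryError("simulated out-of-memory error")
    return 42.0


for _ in range(num_attempts := 5):
    try:
        score = flaky_metric()
    except MemoryError:
        continue  # e.g. move the computation to the CPU and retry
    break  # success, so the else clause is skipped
else:
    raise RuntimeError(f"Giving up after {num_attempts} failed attempts.")
print(score)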
euroeval/task_group_utils/token_classification.py CHANGED
@@ -7,6 +7,7 @@ from copy import deepcopy
  import numpy as np
 
  from ..exceptions import InvalidBenchmark
+ from ..logging_utils import log
  from ..utils import (
  extract_json_dict_from_string,
  raise_if_model_output_contains_nan_values,
@@ -22,9 +23,6 @@
  from ..types import Labels, Predictions
 
 
- logger = logging.getLogger("euroeval")
-
-
  def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  has_misc_tags: bool,
@@ -216,17 +214,19 @@
  prompt_label_mapping = dataset_config.prompt_label_mapping
  for prompt_tag_name, named_entities in prediction_dict.items():
  if not isinstance(named_entities, list):
- logger.debug(
+ log(
  "The model produced an invalid format for the named entities. "
- f"Expected a list but got {type(named_entities)}. Skipping."
+ f"Expected a list but got {type(named_entities)}. Skipping.",
+ level=logging.DEBUG,
  )
  continue
  try:
  named_entities = [str(ne) for ne in named_entities]
  except Exception:
- logger.debug(
+ log(
  "The model produced an invalid format for the named entities. "
- f"Expected a list of strings but got {named_entities}. Skipping."
+ f"Expected a list of strings but got {named_entities}. Skipping.",
+ level=logging.DEBUG,
  )
  continue
  try:
@@ -236,9 +236,10 @@
  if prompt_tag == prompt_tag_name
  ][0]
  except IndexError:
- logger.debug(
+ log(
  "The model produced an invalid prompt tag name, "
- f"{prompt_tag_name}. Skipping."
+ f"{prompt_tag_name}. Skipping.",
+ level=logging.DEBUG,
  )
  continue
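A pattern running through all of these modules is the removal of the module-level logger = logging.getLogger("euroeval") in favour of a shared log helper from the new euroeval/logging_utils.py (+250 lines). A minimal sketch of what such a wrapper might look like; the real module is considerably richer (it also provides get_pbar and log_once, and may coordinate output with progress bars).

# Minimal sketch of a central log helper of the kind these modules now import;
# the real euroeval/logging_utils.py is much larger and may behave differently.
import logging

logger = logging.getLogger("euroeval")


def log(message: str, level: int = logging.INFO) -> None:
    """Emit the message on the shared 'euroeval' logger at the given level."""
    logger.log(level, message)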