EuroEval 15.4.2__py3-none-any.whl → 15.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (54)
  1. euroeval/__init__.py +2 -2
  2. euroeval/benchmark_modules/base.py +3 -2
  3. euroeval/benchmark_modules/fresh.py +8 -6
  4. euroeval/benchmark_modules/hf.py +44 -33
  5. euroeval/benchmark_modules/litellm.py +314 -120
  6. euroeval/benchmark_modules/vllm.py +99 -59
  7. euroeval/benchmarker.py +52 -21
  8. euroeval/callbacks.py +2 -2
  9. euroeval/constants.py +9 -2
  10. euroeval/data_models.py +258 -44
  11. euroeval/dataset_configs/__init__.py +61 -0
  12. euroeval/dataset_configs/danish.py +120 -0
  13. euroeval/dataset_configs/dutch.py +123 -0
  14. euroeval/dataset_configs/english.py +88 -0
  15. euroeval/dataset_configs/faroese.py +53 -0
  16. euroeval/dataset_configs/french.py +83 -0
  17. euroeval/dataset_configs/german.py +91 -0
  18. euroeval/dataset_configs/icelandic.py +148 -0
  19. euroeval/dataset_configs/italian.py +81 -0
  20. euroeval/dataset_configs/norwegian.py +178 -0
  21. euroeval/dataset_configs/spanish.py +78 -0
  22. euroeval/dataset_configs/swedish.py +100 -0
  23. euroeval/exceptions.py +10 -10
  24. euroeval/finetuning.py +6 -10
  25. euroeval/generation.py +1 -0
  26. euroeval/human_evaluation.py +2 -2
  27. euroeval/languages.py +20 -13
  28. euroeval/model_cache.py +1 -1
  29. euroeval/model_loading.py +1 -12
  30. euroeval/prompt_templates/__init__.py +8 -0
  31. euroeval/prompt_templates/linguistic_acceptability.py +112 -0
  32. euroeval/prompt_templates/multiple_choice.py +97 -0
  33. euroeval/prompt_templates/named_entity_recognition.py +257 -0
  34. euroeval/prompt_templates/reading_comprehension.py +118 -0
  35. euroeval/prompt_templates/sentiment_classification.py +137 -0
  36. euroeval/prompt_templates/summarization.py +97 -0
  37. euroeval/speed_benchmark.py +1 -1
  38. euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
  39. euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
  40. euroeval/{task_utils → task_group_utils}/sequence_classification.py +45 -10
  41. euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
  42. euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
  43. euroeval/tasks.py +54 -0
  44. euroeval/tokenization_utils.py +343 -0
  45. euroeval/types.py +3 -1
  46. euroeval/utils.py +5 -254
  47. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/METADATA +31 -9
  48. euroeval-15.6.0.dist-info/RECORD +59 -0
  49. euroeval/dataset_configs.py +0 -2408
  50. euroeval-15.4.2.dist-info/RECORD +0 -40
  51. /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
  52. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
  53. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
  54. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
euroeval/prompt_templates/sentiment_classification.py
@@ -0,0 +1,137 @@
+"""Templates for the Sentiment Analysis task."""
+
+from ..data_models import PromptConfig
+from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+
+SENT_TEMPLATES = {
+    DA: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="neutral", negative="negativ"
+        ),
+        default_prompt_prefix="Følgende er dokumenter og deres sentiment, som kan være "
+        "{labels_str}.",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassificer sentimentet i "
+        "dokumentet. Svar kun med {labels_str}, og intet andet.",
+    ),
+    DE: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="neutral", negative="negativ"
+        ),
+        default_prompt_prefix="Nachfolgend finden Sie Dokumente und ihre Bewertung, "
+        "die {labels_str} sein kann.",
+        default_prompt_template="Dokument: {text}\nStimmung: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassifizieren Sie die "
+        "Stimmung im Dokument. Antworten Sie mit {labels_str}, und nichts anderes.",
+    ),
+    EN: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positive", neutral="neutral", negative="negative"
+        ),
+        default_prompt_prefix="The following are documents and their sentiment, which "
+        "can be {labels_str}.",
+        default_prompt_template="Document: {text}\nSentiment: {label}",
+        default_instruction_prompt="Document: {text}\n\nClassify the sentiment in the "
+        "document. Answer with {labels_str}, and nothing else.",
+    ),
+    ES: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positivo", neutral="neutral", negative="negativo"
+        ),
+        default_prompt_prefix="A continuación se muestran los documentos y su "
+        "sentimiento, que puede ser {labels_str}.",
+        default_prompt_template="Documento: {text}\nSentimiento: {label}",
+        default_instruction_prompt="Documento: {text}\n\nClasifica el sentimiento del "
+        "documento. Responde con {labels_str}, y nada más.",
+    ),
+    FO: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positivt", neutral="neutralt", negative="negativt"
+        ),
+        default_prompt_prefix="Niðanfyri eru skjøl og teirra kenslur, sum kunnu vera "
+        "{labels_str}.",
+        default_prompt_template="Skjal: {text}\nKensla: {label}",
+        default_instruction_prompt="Skjal: {text}\n\nFlokka kensluna í skjalinum. "
+        "Svara við {labels_str}, og einki annað.",
+    ),
+    FR: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positif", neutral="neutre", negative="négatif"
+        ),
+        default_prompt_prefix="Les documents suivants sont accompagnés de leur "
+        "sentiment, qui peut être {labels_str}.",
+        default_prompt_template="Document: {text}\nSentiment: {label}",
+        default_instruction_prompt="Document: {text}\n\nClassez le sentiment dans le "
+        "document. Répondez par {labels_str}, et rien d'autre.",
+    ),
+    IS: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="jákvætt", neutral="hlutlaust", negative="neikvætt"
+        ),
+        default_prompt_prefix="Eftirfarandi eru skjöl og viðhorf þeirra, sem geta "
+        "verið {labels_str}.",
+        default_prompt_template="Skjal: {text}\nViðhorf: {label}",
+        default_instruction_prompt="Skjal: {text}\n\nFlokkaðu viðhorfið í skjalinu. "
+        "Svaraðu með {labels_str}, og ekkert annað.",
+    ),
+    IT: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positivo", neutral="neutro", negative="negativo"
+        ),
+        default_prompt_prefix="Di seguito sono riportati i documenti e il loro "
+        "sentiment, che può essere {labels_str}.",
+        default_prompt_template="Documento: {text}\nSentimento: {label}",
+        default_instruction_prompt="Documento: {text}\n\nClassificare il sentiment del "
+        "documento. Rispondere con {labels_str}, e nient'altro.",
+    ),
+    NB: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="nøytral", negative="negativ"
+        ),
+        default_prompt_prefix="Her følger dokumenter og deres sentiment, som kan være "
+        "{labels_str}",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
+        "teksten. Svar med {labels_str}, og ikke noe annet.",
+    ),
+    NL: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positief", neutral="neutraal", negative="negatief"
+        ),
+        default_prompt_prefix="Hieronder volgen documenten en hun sentiment, dat "
+        "{labels_str} kan zijn.",
+        default_prompt_template="Document: {text}\nSentiment: {label}",
+        default_instruction_prompt="Document: {text}\n\nClassificeer het sentiment in "
+        "het document. Antwoord met {labels_str}, en verder niets.",
+    ),
+    NN: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="nøytral", negative="negativ"
+        ),
+        default_prompt_prefix="Her følger dokumenter og deres sentiment, som kan være "
+        "{labels_str}",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
+        "teksten. Svar med {labels_str}, og ikke noe annet.",
+    ),
+    NO: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="nøytral", negative="negativ"
+        ),
+        default_prompt_prefix="Her følger dokumenter og deres sentiment, som kan være "
+        "{labels_str}",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
+        "teksten. Svar med {labels_str}, og ikke noe annet.",
+    ),
+    SV: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="neutral", negative="negativ"
+        ),
+        default_prompt_prefix="Nedan följer dokument och deras sentiment, som kan vara "
+        "{labels_str}.",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassificera känslan i "
+        "dokumentet. Svara med {labels_str}, och inget annat.",
+    ),
+}
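For orientation only, not part of the diff: each PromptConfig above bundles a few-shot prefix, a per-example template and an instruction-style prompt. Below is a minimal sketch of how the English sentiment template could be rendered into a few-shot prompt, assuming the placeholders are filled with plain str.format and that {labels_str} is a comma-joined list of the mapped label names; the actual assembly logic inside EuroEval may differ.

# Illustrative sketch only -- the template strings are taken from the diff
# above, but the assembly steps below are hypothetical, not EuroEval code.
prefix = "The following are documents and their sentiment, which can be {labels_str}."
template = "Document: {text}\nSentiment: {label}"
label_mapping = dict(positive="positive", neutral="neutral", negative="negative")

labels_str = ", ".join(label_mapping.values())  # assumption: comma-joined labels
few_shot = [("I loved this film.", "positive"), ("The service was awful.", "negative")]
new_doc = "The plot was fine, nothing special."

# Prefix, then the few-shot examples, then the unanswered example.
prompt = prefix.format(labels_str=labels_str) + "\n\n"
prompt += "\n\n".join(template.format(text=doc, label=lab) for doc, lab in few_shot)
prompt += "\n\n" + template.format(text=new_doc, label="").rstrip()
print(prompt)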
euroeval/prompt_templates/summarization.py
@@ -0,0 +1,97 @@
+"""Templates for the Summarization task."""
+
+from ..data_models import PromptConfig
+from ..languages import DA, DE, EN, ES, FR, IS, IT, NB, NL, NN, NO, SV
+
+# TODO: Missing Faroese
+SUMM_TEMPLATES = {
+    DA: PromptConfig(
+        default_prompt_prefix="Følgende er dokumenter med tilhørende resuméer.",
+        default_prompt_template="Dokument: {text}\nResumé: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv et resumé af ovenstående "
+        "dokument.",
+        default_prompt_label_mapping=dict(),
+    ),
+    DE: PromptConfig(
+        default_prompt_prefix="Nachstehend finden Sie Dokumente mit zugehörigen "
+        "Zusammenfassungen.",
+        default_prompt_template="Dokument: {text}\nZusammenfassung: {target_text}",
+        default_instruction_prompt="Nachrichtenartikel: {text}\n\nSchreiben Sie eine "
+        "Zusammenfassung des oben genannten Dokuments.",
+        default_prompt_label_mapping=dict(),
+    ),
+    EN: PromptConfig(
+        default_prompt_prefix="The following are documents with accompanying "
+        "summaries.",
+        default_prompt_template="Document: {text}\nSummary: {target_text}",
+        default_instruction_prompt="Document: {text}\n\nWrite a summary of the above "
+        "document.",
+        default_prompt_label_mapping=dict(),
+    ),
+    ES: PromptConfig(
+        default_prompt_prefix="A continuación se presentan documentos con resúmenes "
+        "adjuntos.",
+        default_prompt_template="Documento: {text}\nResumen: {target_text}",
+        default_instruction_prompt="Documento: {text}\n\nEscriba un resumen del "
+        "documento anterior.",
+        default_prompt_label_mapping=dict(),
+    ),
+    FR: PromptConfig(
+        default_prompt_prefix="Les documents suivants sont accompagnés d'un résumé.",
+        default_prompt_template="Document: {text}\nRésumé: {target_text}",
+        default_instruction_prompt="Document: {text}\n\nRédigez un résumé du "
+        "document ci-dessus.",
+        default_prompt_label_mapping=dict(),
+    ),
+    IS: PromptConfig(
+        default_prompt_prefix="Eftirfarandi eru skjöl með meðfylgjandi samantektum.",
+        default_prompt_template="Skjal: {text}\nSamantekt: {target_text}",
+        default_instruction_prompt="Skjal: {text}\n\nSkrifaðu samantekt á ofangreindu "
+        "skjali.",
+        default_prompt_label_mapping=dict(),
+    ),
+    IT: PromptConfig(
+        default_prompt_prefix="Di seguito sono riportati i documenti con le relative "
+        "sintesi.",
+        default_prompt_template="Documento: {text}\nSintesi: {target_text}",
+        default_instruction_prompt="Documento: {text}\n\nScrivete una sintesi del "
+        "documento di cui sopra.",
+        default_prompt_label_mapping=dict(),
+    ),
+    NB: PromptConfig(
+        default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
+        default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv et sammendrag av "
+        "dokumentet ovenfor.",
+        default_prompt_label_mapping=dict(),
+    ),
+    NL: PromptConfig(
+        default_prompt_prefix="Hieronder volgen documenten met bijbehorende "
+        "samenvattingen.",
+        default_prompt_template="Document: {text}\nSamenvatting: {target_text}",
+        default_instruction_prompt="Document: {text}\n\nSchrijf een samenvatting van "
+        "het bovenstaande document.",
+        default_prompt_label_mapping=dict(),
+    ),
+    NN: PromptConfig(
+        default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
+        default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv et sammendrag av "
+        "dokumentet ovenfor.",
+        default_prompt_label_mapping=dict(),
+    ),
+    NO: PromptConfig(
+        default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
+        default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv et sammendrag av "
+        "dokumentet ovenfor.",
+        default_prompt_label_mapping=dict(),
+    ),
+    SV: PromptConfig(
+        default_prompt_prefix="Nedan följer dokument med tillhörande sammanfattningar.",
+        default_prompt_template="Dokument: {text}\nSammanfattning: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv en sammanfattning av "
+        "ovanstående dokument.",
+        default_prompt_label_mapping=dict(),
+    ),
+}
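Also for illustration and not from the package: the summarization templates carry no label mapping and use {target_text} for the reference summary. For instruction-tuned models the default_instruction_prompt would typically be sent as a single user message, roughly like the hypothetical sketch below.

# Hypothetical rendering of the Danish instruction prompt as one chat message;
# only the template string comes from the diff above.
instruction = "Dokument: {text}\n\nSkriv et resumé af ovenstående dokument."
document = "Regeringen fremlagde i dag sin nye klimaplan ..."
messages = [{"role": "user", "content": instruction.format(text=document)}]
print(messages)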
euroeval/speed_benchmark.py
@@ -4,7 +4,7 @@ import logging
 
 import pyinfer
 from tqdm.auto import tqdm
-from transformers import AutoTokenizer
+from transformers.models.auto.tokenization_auto import AutoTokenizer
 
 from .benchmark_modules import (
     BenchmarkModule,
euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py
@@ -8,7 +8,9 @@ from collections import defaultdict
 
 import numpy as np
 from datasets import Dataset
-from transformers import BatchEncoding, PreTrainedTokenizer, Trainer
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import BatchEncoding
+from transformers.trainer import Trainer
 
 if t.TYPE_CHECKING:
     from ..types import Labels, Predictions
@@ -19,12 +21,12 @@ logger = logging.getLogger("euroeval")
 class MultipleChoiceClassificationTrainer(Trainer):
     """Trainer subclass for question answering tasks."""
 
-    def evaluate(
+    def evaluate(  # type: ignore[override]
         self,
         eval_dataset: "Dataset | None" = None,
         ignore_keys: list[str] | None = None,
         metric_key_prefix: str = "eval",
-    ) -> dict[str, float] | None:
+    ) -> dict[str, float]:
         """Evaluate the model on the given dataset.
 
         Args:
@@ -54,22 +56,28 @@ class MultipleChoiceClassificationTrainer(Trainer):
             metric_key_prefix=metric_key_prefix,
         )
 
+        predictions = output.predictions
+        assert isinstance(predictions, np.ndarray)
+
+        metrics = output.metrics
+        assert metrics is not None
+
         if metric_key_prefix == "test":
             preds_and_labels = postprocess_predictions_and_labels(
-                predictions=output.predictions, dataset=eval_dataset
+                predictions=predictions, dataset=eval_dataset
             )
-            output.metrics.update(self.compute_metrics(preds_and_labels))
+            assert self.compute_metrics is not None
+            new_metrics = self.compute_metrics(preds_and_labels)  # type: ignore[arg-type]
+            metrics.update(new_metrics)
 
         # Prefix all keys with metric_key_prefix + '_'
-        for key in list(output.metrics.keys()):
+        for key in list(metrics.keys()):
             if not key.startswith(f"{metric_key_prefix}_"):
-                output.metrics[f"{metric_key_prefix}_{key}"] = output.metrics.pop(
-                    key
-                )
+                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
 
         # Only the main node log the results by default
         if self.args.should_log:
-            self.log(output.metrics)
+            self.log(metrics)
 
         self.control = self.callback_handler.on_evaluate(
             self.args,
@@ -77,7 +85,7 @@
             self.control,  # type: ignore[has-type]
             output.metrics,
         )
-        return output.metrics
+        return metrics
 
 
 def prepare_examples(
euroeval/{task_utils → task_group_utils}/question_answering.py
@@ -8,25 +8,22 @@ from collections import defaultdict
 import evaluate
 import numpy as np
 from evaluate import EvaluationModule
-from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from transformers.trainer import Trainer
 
 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
-from ..utils import (
-    get_special_token_metadata,
-    raise_if_model_output_contains_nan_values,
-)
+from ..tokenization_utils import get_special_token_metadata
+from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
     import torch.nn as nn
     from datasets.arrow_dataset import Dataset
-    from transformers import (
-        EvalPrediction,
-        PreTrainedModel,
-        TrainerCallback,
-        TrainingArguments,
-    )
+    from transformers.modeling_utils import PreTrainedModel
     from transformers.tokenization_utils_base import BatchEncoding
+    from transformers.trainer_callback import TrainerCallback
+    from transformers.trainer_utils import EvalPrediction
+    from transformers.training_args import TrainingArguments
 
     from ..types import Labels, Predictions
 
@@ -47,7 +44,7 @@ class QuestionAnsweringTrainer(Trainer):
         callbacks: "list[TrainerCallback]",
         data_collator: "c.Callable",
     ) -> None:
-        """Initialize the trainer."""
+        """Initialise the trainer."""
        super().__init__(
            model=model,
            processing_class=processing_class,
@@ -68,13 +65,13 @@ class QuestionAnsweringTrainer(Trainer):
         # Set the label names
         self.label_names = ["start_positions", "end_positions"]
 
-    def evaluate(
+    def evaluate(  # type: ignore[override]
         self,
         eval_dataset: "Dataset | None" = None,
         orig_eval_dataset: "Dataset | None" = None,
         ignore_keys: list[str] | None = None,
         metric_key_prefix: str = "eval",
-    ) -> dict[str, float] | None:
+    ) -> dict[str, float]:
         """Evaluate the model on the given dataset.
 
         Args:
@@ -113,33 +110,39 @@ class QuestionAnsweringTrainer(Trainer):
         finally:
             self.compute_metrics = compute_metrics
 
+        predictions = output.predictions
+        assert isinstance(predictions, tuple)
+
+        metrics = output.metrics
+        assert metrics is not None
+
         if orig_eval_dataset is not None:
             preds_and_labels = postprocess_predictions_and_labels(
-                predictions=output.predictions,
+                predictions=predictions,  # type: ignore[arg-type]
                 dataset=orig_eval_dataset,
                 prepared_dataset=eval_dataset,
                 cls_token_index=self.cls_token_id,
             )
-            output.metrics.update(self.compute_metrics(preds_and_labels))
+            assert self.compute_metrics is not None
+            new_metrics = self.compute_metrics(preds_and_labels)  # type: ignore[arg-type]
+            metrics.update(new_metrics)
 
         # Prefix all keys with metric_key_prefix + '_'
-        for key in list(output.metrics.keys()):
+        for key in list(metrics.keys()):
             if not key.startswith(f"{metric_key_prefix}_"):
-                output.metrics[f"{metric_key_prefix}_{key}"] = output.metrics.pop(
-                    key
-                )
+                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
 
         # Only the main node log the results by default
        if self.args.should_log:
-            self.log(output.metrics)
+            self.log(metrics)
 
        self.control = self.callback_handler.on_evaluate(
            self.args,
            self.state,
            self.control,  # type: ignore[has-type]
-            output.metrics,
+            metrics,
        )
-        return output.metrics
+        return metrics
 
 
 def compute_metrics(
@@ -472,7 +475,7 @@ def prepare_test_examples(
 
 
 def postprocess_predictions_and_labels(
-    predictions: list,
+    predictions: tuple[np.ndarray, np.ndarray],
     dataset: "Dataset",
     prepared_dataset: "Dataset",
     cls_token_index: int,
@@ -492,9 +495,7 @@ def postprocess_predictions_and_labels(
     Returns:
         The postprocessed predictions and labels.
     """
-    # Extract the logits from the predictions
-    all_start_logits = predictions[0]
-    all_end_logits = predictions[1]
+    all_start_logits, all_end_logits = predictions
 
     # Build a map from an example to its corresponding features, being the blocks of
     # text from the context that we're feeding into the model. An example can have
@@ -507,7 +508,7 @@ def postprocess_predictions_and_labels(
         features_per_example[example_index].append(i)
 
     # Loop over all the examples
-    predictions = list()
+    prediction_list: list[dict[str, t.Any]] = list()
     labels = list()
     for example_index, example in enumerate(dataset):
         # Extract the best valid answer associated with the current example
@@ -530,7 +531,7 @@ def postprocess_predictions_and_labels(
         )
 
         # Add the answer to the list of predictions
-        predictions.append(prediction)
+        prediction_list.append(prediction)
 
         # Create the associated reference dictionary, to be added to the list of
         # references
@@ -545,7 +546,7 @@ def postprocess_predictions_and_labels(
         # Add the answer and label to the list of predictions and labels, respectively
         labels.append(label)
 
-    return predictions, labels
+    return prediction_list, labels
 
 
 def find_best_answer(
euroeval/{task_utils → task_group_utils}/sequence_classification.py
@@ -10,10 +10,11 @@ import numpy as np
 from evaluate import EvaluationModule
 
 from ..data_models import BenchmarkConfig, GenerativeModelOutput
+from ..exceptions import InvalidBenchmark
 from ..utils import log_once, raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
-    from transformers import EvalPrediction
+    from transformers.trainer_utils import EvalPrediction
 
     from ..data_models import DatasetConfig
     from ..types import Labels, Predictions
@@ -110,6 +111,7 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: GenerativeModelOutput,
     dataset_config: "DatasetConfig",
+    first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
     """Extract the predicted labels from the generated output.
 
@@ -121,13 +123,19 @@
             The raw generated output of the model.
         dataset_config:
             The configuration of the dataset.
+        first_label_token_mapping:
+            A mapping from labels to the first token in each label, or alternatively a
+            Boolean value indicating whether the model should output scores (if the
+            mapping is outputted then the model will always output scores).
 
     Returns:
         The predicted labels.
     """
     if model_output.scores is not None:
         return get_closest_logprobs_labels(
-            generation_logprobs=model_output.scores, dataset_config=dataset_config
+            generation_logprobs=model_output.scores,
+            dataset_config=dataset_config,
+            first_label_token_mapping=first_label_token_mapping,
         )
     else:
         return get_closest_word_edit_labels(
@@ -138,6 +146,7 @@ def extract_labels_from_generation(
 def get_closest_logprobs_labels(
     generation_logprobs: list[list[list[tuple[str, float]]]],
     dataset_config: "DatasetConfig",
+    first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
     """Get the labels with the highest predicted logprob value.
 
@@ -152,6 +161,10 @@
            (batch_size, num_tokens, num_logprobs).
        dataset_config:
            The configuration of the dataset.
+        first_label_token_mapping:
+            A mapping from labels to the first token in each label, or alternatively a
+            Boolean value indicating whether the model should output scores (if the
+            mapping is outputted then the model will always output scores).
 
     Returns:
         The predicted labels.
@@ -185,11 +198,29 @@
             generated_label = "".join(previously_generated_labels) + generated_label
 
             # Get the candidate labels that starts with the generated label
-            candidate_output_labels = {
-                candidate_label
-                for candidate_label in candidate_labels
-                if candidate_label.startswith(generated_label)
-            }
+            if isinstance(first_label_token_mapping, dict):
+                if any(
+                    candidate_label not in first_label_token_mapping
+                    for candidate_label in candidate_labels
+                ):
+                    raise InvalidBenchmark(
+                        "There is a label not present in the first label token "
+                        "mapping - this should never happen! Please report this "
+                        "issue to the EuroEval team at "
+                        "github.com/EuroEval/EuroEval/issues."
+                    )
+
+                candidate_output_labels = {
+                    candidate_label
+                    for candidate_label in candidate_labels
+                    if generated_label == first_label_token_mapping[candidate_label]
+                }
+            else:
+                candidate_output_labels = {
+                    candidate_label
+                    for candidate_label in candidate_labels
+                    if candidate_label.startswith(generated_label)
+                }
 
             # If we can uniquely determine the output label, we break the loop. If
             # there are multiple possible labels then we store the current one, and
@@ -206,7 +237,7 @@
                 else:
                     output_label = candidate_output_labels.pop()
                     candidate_output_labels.add(output_label)
-                    log_once(
+                    raise InvalidBenchmark(
                         "Multiple candidate labels found for the generated label "
                         f"{generated_label!r}: {candidate_output_labels}. Since "
                         "this is not the first generated label, we cannot "
@@ -214,9 +245,13 @@
                        f"forced to use the arbitrary {output_label!r} as the "
                        "output label, potentially resulting in worse performance. "
                        "Please report this issue to the EuroEval team at "
-                        "github.com/EuroEval/EuroEval/issues.",
-                        level=logging.WARNING,
+                        "github.com/EuroEval/EuroEval/issues."
                     )
+            elif len(candidate_output_labels) == 0:
+                logger.debug(
+                    f"No candidate label found for the generated label "
+                    f"{generated_label!r}. The generated label is thus ignored."
+                )
 
         if output_label is not None:
             output_labels.append(output_label)
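The behavioural change in get_closest_logprobs_labels above: when first_label_token_mapping is a dict, a generated token must match the stored first token of a candidate label exactly, a label missing from the mapping is now a hard InvalidBenchmark error, and an ambiguous non-first token also raises instead of only warning; without the dict, the old prefix match is kept. The following is a standalone, illustrative-only sketch of the selection step (the function name and the example "first token" values are made up, not taken from the package):

# Illustrative reimplementation of the candidate-selection step shown in the
# diff above; not the package's actual function, and the example first tokens
# below are invented rather than produced by a real tokenizer.
def select_candidates(
    generated_label: str,
    candidate_labels: list[str],
    first_label_token_mapping: dict[str, str] | bool,
) -> set[str]:
    if isinstance(first_label_token_mapping, dict):
        # New behaviour: exact match against the pre-computed first token of
        # each candidate label.
        return {
            label
            for label in candidate_labels
            if generated_label == first_label_token_mapping[label]
        }
    # Old behaviour: any label that starts with the generated text matches.
    return {label for label in candidate_labels if label.startswith(generated_label)}

labels = ["positiv", "negativ", "neutral"]
mapping = {"positiv": "posit", "negativ": "negat", "neutral": "neut"}
print(select_candidates("posit", labels, mapping))  # {'positiv'}
print(select_candidates("ne", labels, False))       # {'negativ', 'neutral'}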
euroeval/{task_utils → task_group_utils}/text_to_text.py
@@ -17,7 +17,7 @@ from ..utils import (
 )
 
 if t.TYPE_CHECKING:
-    from transformers import EvalPrediction
+    from transformers.trainer_utils import EvalPrediction
 
     from ..types import Labels, Predictions
 
euroeval/{task_utils → task_group_utils}/token_classification.py
@@ -9,14 +9,15 @@ import demjson3
 import evaluate
 import numpy as np
 from evaluate import EvaluationModule
-from transformers import PreTrainedTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
 
 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
-    from transformers import BatchEncoding, EvalPrediction
+    from transformers.tokenization_utils_base import BatchEncoding
+    from transformers.trainer_utils import EvalPrediction
 
     from ..types import Labels, Predictions
 