EuroEval 15.16.0__py3-none-any.whl → 16.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)
  1. euroeval/__init__.py +8 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +199 -139
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +19 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +73 -23
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +35 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +90 -20
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +276 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/model_cache.py +13 -1
  41. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  42. euroeval/prompt_templates/multiple_choice.py +23 -2
  43. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  44. euroeval/prompt_templates/reading_comprehension.py +42 -2
  45. euroeval/prompt_templates/sentiment_classification.py +46 -2
  46. euroeval/prompt_templates/summarization.py +24 -4
  47. euroeval/scores.py +7 -2
  48. euroeval/speed_benchmark.py +6 -6
  49. euroeval/task_group_utils/multiple_choice_classification.py +19 -8
  50. euroeval/task_group_utils/question_answering.py +35 -28
  51. euroeval/task_group_utils/sequence_classification.py +128 -42
  52. euroeval/task_group_utils/text_to_text.py +7 -3
  53. euroeval/task_group_utils/token_classification.py +59 -73
  54. euroeval/tasks.py +33 -6
  55. euroeval/tokenization_utils.py +294 -207
  56. euroeval/utils.py +150 -35
  57. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
  58. euroeval-16.0.1.dist-info/RECORD +69 -0
  59. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
  60. euroeval/human_evaluation.py +0 -738
  61. euroeval/metrics.py +0 -470
  62. euroeval-15.16.0.dist-info/RECORD +0 -63
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
  64. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
@@ -75,7 +75,11 @@ def compute_metrics(
     while True:
         try:
             score: float | None = metric(
-                predictions=predictions, references=labels, dataset=dataset
+                predictions=predictions,
+                references=labels,
+                dataset=dataset,
+                dataset_config=dataset_config,
+                benchmark_config=benchmark_config,
             )
             break
         except Exception as e:
@@ -85,7 +89,7 @@ def compute_metrics(
                 "MPS backend out of memory",
             ]
             if not any(error in str(e) for error in oom_error):
-                raise InvalidBenchmark(str(e))
+                raise InvalidBenchmark(str(e)) from e
 
             if (
                 isinstance(metric, HuggingFaceMetric)
@@ -98,7 +102,7 @@ def compute_metrics(
                     "the CPU."
                 )
             else:
-                raise InvalidBenchmark(str(e))
+                raise InvalidBenchmark(str(e)) from e
         finally:
             for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
                 if hasattr(metric, attribute):
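
The metric call site above now passes `dataset_config` and `benchmark_config` alongside the predictions and references. Below is a minimal sketch of a metric callable compatible with that calling convention; the class name and signature are illustrative assumptions, not EuroEval's actual `Metric` base class from `euroeval/metrics/base.py`.

```python
# Minimal sketch of a metric callable matching the call site above.
# The class and its signature are assumptions for illustration only;
# EuroEval's real Metric base class (euroeval/metrics/base.py) may differ.
import typing as t


class DummyAccuracyMetric:
    name = "dummy_accuracy"

    def __call__(
        self,
        predictions: t.Sequence[t.Any],
        references: t.Sequence[t.Any],
        dataset: t.Any = None,
        dataset_config: t.Any = None,
        benchmark_config: t.Any = None,
    ) -> float | None:
        # The extra config objects are accepted (and may simply be ignored)
        # so that every metric shares the same calling convention.
        if not references:
            return None
        correct = sum(p == r for p, r in zip(predictions, references))
        return correct / len(references)


score = DummyAccuracyMetric()(predictions=["a", "b", "b"], references=["a", "b", "c"])
print(score)  # 0.666...
```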
euroeval/task_group_utils/token_classification.py CHANGED
@@ -1,15 +1,16 @@
 """Utility functions related to the token-classification task group."""
 
 import logging
-import re
 import typing as t
 from copy import deepcopy
 
-import demjson3
 import numpy as np
 
 from ..exceptions import InvalidBenchmark
-from ..utils import raise_if_model_output_contains_nan_values
+from ..utils import (
+    extract_json_dict_from_string,
+    raise_if_model_output_contains_nan_values,
+)
 
 if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
@@ -17,7 +18,7 @@ if t.TYPE_CHECKING:
     from transformers.tokenization_utils_base import BatchEncoding
     from transformers.trainer_utils import EvalPrediction
 
-    from ..data_models import DatasetConfig, GenerativeModelOutput
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 
@@ -28,6 +29,7 @@ def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     has_misc_tags: bool,
     dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     dataset: "Dataset",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
@@ -40,6 +42,8 @@ def compute_metrics(
             Whether the dataset has MISC tags.
         dataset_config:
             The configuration of the dataset.
+        benchmark_config:
+            The configuration of the benchmark.
         dataset:
             The dataset used for evaluation. This is only used in case any additional
             metadata is used to compute the metrics.
@@ -142,7 +146,11 @@ def compute_metrics(
         if metric.name == "micro_f1"
     )
     micro_f1_score = metric(
-        predictions=predictions, references=list(labels), dataset=dataset
+        predictions=predictions,
+        references=list(labels),
+        dataset=dataset,
+        dataset_config=dataset_config,
+        benchmark_config=benchmark_config,
     )
 
     # Compute the metrics without MISC tags
@@ -165,7 +173,11 @@ def compute_metrics(
         if metric.name == "micro_f1_no_misc"
     )
     micro_f1_no_misc_score = metric(
-        predictions=predictions_no_misc, references=labels_no_misc, dataset=dataset
+        predictions=predictions_no_misc,
+        references=labels_no_misc,
+        dataset=dataset,
+        dataset_config=dataset_config,
+        benchmark_config=benchmark_config,
    )
 
     # Raise error if the metrics are invalid
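
For datasets with MISC tags, the code above computes micro-F1 twice: once on the raw label sequences and once after the MISC entities have been stripped. A hedged sketch of the stripping step, assuming the lowercase BIO tags used by the NER task defaults (`b-misc`, `i-misc`) and `o` as the outside tag; the helper name is hypothetical.

```python
# Hedged sketch: deriving the "no MISC" label sequences before re-running the
# micro-F1 metric. The helper name and the exact outside tag ("o") are
# assumptions for illustration; EuroEval's own implementation may differ.
def strip_misc_tags(sequences: list[list[str]]) -> list[list[str]]:
    """Replace b-misc/i-misc tags with the outside tag."""
    return [
        ["o" if tag.lower() in ("b-misc", "i-misc") else tag for tag in seq]
        for seq in sequences
    ]


labels = [["b-per", "i-per", "o", "b-misc"]]
predictions = [["b-per", "i-per", "b-misc", "b-misc"]]
labels_no_misc = strip_misc_tags(labels)
predictions_no_misc = strip_misc_tags(predictions)
print(labels_no_misc)       # [['b-per', 'i-per', 'o', 'o']]
print(predictions_no_misc)  # [['b-per', 'i-per', 'o', 'o']]
```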
@@ -194,55 +206,29 @@ extract_labels_from_generation(
     Returns:
         The predicted labels.
     """
-    raw_predictions = model_output.sequences
-
-    # Attempt to extract the JSON dictionary from the predictions
-    json_regex = r"\{[^{}]+?\}"
-    json_matches = [
-        re.search(pattern=json_regex, string=raw_prediction, flags=re.DOTALL)
-        or raw_prediction
-        for raw_prediction in raw_predictions
-    ]
-    raw_predictions = [
-        json_match.group() if isinstance(json_match, re.Match) else json_match
-        for json_match in json_matches
-    ]
-
     tokens = input_batch["tokens"]
     predicted_labels: list[list[str]] = [["o"] * len(token_ids) for token_ids in tokens]
-    for idx, raw_prediction in enumerate(raw_predictions):
-        try:
-            json_output = demjson3.decode(txt=raw_prediction)
-            if not isinstance(json_output, dict):
-                logger.debug(
-                    "The model output is not a JSON dictionary, so cannot parse "
-                    f"it. Skipping. Here is the output: {raw_prediction}"
-                )
-                continue
-            elif not all(isinstance(key, str) for key in json_output.keys()):
+    for idx, raw_prediction in enumerate(model_output.sequences):
+        prediction_dict = extract_json_dict_from_string(s=raw_prediction)
+        if prediction_dict is None:
+            continue
+
+        prompt_label_mapping = dataset_config.prompt_label_mapping
+        for prompt_tag_name, named_entities in prediction_dict.items():
+            if not isinstance(named_entities, list):
                 logger.debug(
-                    "The model output is not a JSON dictionary with string keys, "
-                    "so cannot parse it. Skipping. Here is the output: "
-                    f"{raw_prediction}"
+                    "The model produced an invalid format for the named entities. "
+                    f"Expected a list but got {type(named_entities)}. Skipping."
                 )
                 continue
-            elif not all(isinstance(value, list) for value in json_output.values()):
+            try:
+                named_entities = [str(ne) for ne in named_entities]
+            except Exception:
                 logger.debug(
-                    "The model output is not a JSON dictionary with list values, "
-                    "so cannot parse it. Skipping. Here is the output: "
-                    f"{raw_prediction}"
+                    "The model produced an invalid format for the named entities. "
+                    f"Expected a list of strings but got {named_entities}. Skipping."
                 )
                 continue
-            prediction_dict: dict[str, list[str]] = json_output
-        except demjson3.JSONDecodeError:
-            logger.debug(
-                "The model output is not valid JSON, so cannot parse it. Skipping. "
-                f"Here is the output: {raw_prediction!r}"
-            )
-            continue
-
-        prompt_label_mapping = dataset_config.prompt_label_mapping
-        for prompt_tag_name, named_entities in prediction_dict.items():
             try:
                 tag_name = [
                     tag[2:]
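
The bespoke regex-plus-`demjson3` parsing above is replaced by a single call to `extract_json_dict_from_string` from `euroeval/utils.py`. That helper's implementation is not part of this diff, so the sketch below only illustrates the behaviour the call site relies on (return the parsed dict, or `None` when no valid JSON object with string keys can be found); treat the details as assumptions.

```python
# Hedged sketch of a JSON-dict extraction helper in the spirit of
# `extract_json_dict_from_string` (the real implementation lives in
# euroeval/utils.py and is not shown in this diff, so details here are assumed).
import json
import re


def extract_json_dict_from_string(s: str) -> dict | None:
    """Return the first JSON object found in `s`, or None if parsing fails."""
    match = re.search(r"\{.*?\}", s, flags=re.DOTALL)
    if match is None:
        return None
    try:
        parsed = json.loads(match.group())
    except json.JSONDecodeError:
        return None
    # Only dictionaries with string keys are useful for NER label extraction
    if not isinstance(parsed, dict) or not all(isinstance(k, str) for k in parsed):
        return None
    return parsed


output = 'Sure! Here are the entities: {"person": ["Anna"], "location": ["Oslo"]}'
print(extract_json_dict_from_string(output))
# {'person': ['Anna'], 'location': ['Oslo']}
```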
@@ -272,15 +258,15 @@ def extract_labels_from_generation(
 
 
 def tokenize_and_align_labels(
-    examples: dict, tokenizer: "PreTrainedTokenizer", label2id: dict[str, int]
+    examples: dict, tokeniser: "PreTrainedTokenizer", label2id: dict[str, int]
 ) -> "BatchEncoding":
     """Tokenise all texts and align the labels with them.
 
     Args:
         examples:
             The examples to be tokenised.
-        tokenizer:
-            A pretrained tokenizer.
+        tokeniser:
+            A pretrained tokeniser.
         label2id:
             A dictionary that converts NER tags to IDs.
 
@@ -289,22 +275,22 @@ def tokenize_and_align_labels(
     """
     # Tokenize the texts. We use the `is_split_into_words` argument here because
     # the texts in our dataset are lists of words (with a label for each word)
-    tokenized_inputs = tokenizer(
+    tokenized_inputs = tokeniser(
         examples["tokens"], is_split_into_words=True, truncation=True, padding=True
     )
 
     # Extract a mapping between all the tokens and their corresponding word. If the
-    # tokenizer is of a "fast" variant then this can be accessed through the
+    # tokeniser is of a "fast" variant then this can be accessed through the
     # `word_ids` method. Otherwise, we have to extract it manually.
     all_labels: list[list[int]] = list()
     labels: list[str]
     word_ids: list[int | None]
     for i, labels in enumerate(examples["labels"]):
-        # Try to get the word IDs from the tokenizer
+        # Try to get the word IDs from the tokeniser
        try:
             word_ids = tokenized_inputs.word_ids(batch_index=i)
 
-        # If the tokenizer is not of a "fast" variant, we have to extract the word
+        # If the tokeniser is not of a "fast" variant, we have to extract the word
         # IDs manually
         except ValueError:
             # Get the list of words in the document
@@ -314,7 +300,7 @@ def tokenize_and_align_labels(
             tok_ids: list[int] = tokenized_inputs.input_ids[i]
 
             # Decode the token IDs
-            tokens = tokenizer.convert_ids_to_tokens(tok_ids)
+            tokens = tokeniser.convert_ids_to_tokens(tok_ids)
             assert isinstance(tokens, list)
 
             # Remove prefixes from the tokens
@@ -326,14 +312,14 @@ def tokenize_and_align_labels(
                     tokens[tok_idx] = tok[len(prefix) :]
 
             # Replace UNK tokens with the correct word
-            tokens = handle_unk_tokens(tokenizer=tokenizer, tokens=tokens, words=words)
+            tokens = handle_unk_tokens(tokeniser=tokeniser, tokens=tokens, words=words)
 
-            # Get list of special tokens. Some tokenizers do not record these
+            # Get list of special tokens. Some tokenisers do not record these
             # properly, which is why we convert the values to their indices and
             # then back to strings
             sp_toks = [
-                tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids(sp_tok))
-                for sp_tok in tokenizer.special_tokens_map.values()
+                tokeniser.convert_ids_to_tokens(tokeniser.convert_tokens_to_ids(sp_tok))
+                for sp_tok in tokeniser.special_tokens_map.values()
             ]
 
             # Replace special tokens with `None`
@@ -357,7 +343,7 @@ def tokenize_and_align_labels(
             if len(word_idxs) != len(token_idxs):
                 raise InvalidBenchmark(
                     "The tokens could not be aligned with the words during manual "
-                    "word-token alignment. It seems that the tokenizer is neither "
+                    "word-token alignment. It seems that the tokeniser is neither "
                     "of the fast variant nor of a SentencePiece/WordPiece variant."
                 )
 
@@ -387,9 +373,9 @@ def tokenize_and_align_labels(
                 label = labels[word_id]
                 try:
                     label_id = label2id[label.lower()]
-                except KeyError:
+                except KeyError as e:
                     msg = f"The label {label} was not found in the model's config."
-                    raise InvalidBenchmark(msg)
+                    raise InvalidBenchmark(msg) from e
                 label_ids.append(label_id)
 
             # For the other tokens in a word, we set the label to -100
@@ -404,13 +390,13 @@ def tokenize_and_align_labels(
 
 
 def handle_unk_tokens(
-    tokenizer: "PreTrainedTokenizer", tokens: list[str], words: list[str]
+    tokeniser: "PreTrainedTokenizer", tokens: list[str], words: list[str]
 ) -> list[str]:
     """Replace unknown tokens in the tokens with the corresponding word.
 
     Args:
-        tokenizer:
-            The tokenizer used to tokenize the words.
+        tokeniser:
+            The tokeniser used to tokenize the words.
         tokens:
             The list of tokens.
         words:
@@ -420,15 +406,15 @@ def handle_unk_tokens(
         The list of tokens with unknown tokens replaced by the corresponding word.
     """
     # Locate the token indices of the unknown tokens
-    token_unk_idxs = [i for i, tok in enumerate(tokens) if tok == tokenizer.unk_token]
+    token_unk_idxs = [i for i, tok in enumerate(tokens) if tok == tokeniser.unk_token]
 
     # Locate the word indices of the words which contain an unknown token
     word_unk_idxs = [
         i
         for i, word in enumerate(words)
-        if tokenizer.unk_token
-        in tokenizer.convert_ids_to_tokens(
-            tokenizer.encode(word, add_special_tokens=False)
+        if tokeniser.unk_token
+        in tokeniser.convert_ids_to_tokens(
+            tokeniser.encode(word, add_special_tokens=False)
         )
     ]
 
@@ -438,8 +424,8 @@ def handle_unk_tokens(
         word = words[word_idx]
 
         # Tokenize the word, which is now a list containing at least one UNK token
-        tokens_with_unk = tokenizer.convert_ids_to_tokens(
-            tokenizer.encode(word, add_special_tokens=False)
+        tokens_with_unk = tokeniser.convert_ids_to_tokens(
+            tokeniser.encode(word, add_special_tokens=False)
         )
 
         # Iterate over the tokens in the word
@@ -448,10 +434,10 @@ def handle_unk_tokens(
             # of the content of this token from the word. The result of the `word`
             # variable will be the content of the UNK token.
            # NOTE: This is a bit hacky and not bulletproof. For instance, if the
-            # word is "1925-1950" and the tokenizer splits it into ["[UNK]", "-",
+            # word is "1925-1950" and the tokeniser splits it into ["[UNK]", "-",
             # "19", "50"], then the result will be 2519 instead of 1925. This
             # happens almost never, however, so we can live with it.
-            if possible_unk_token != tokenizer.unk_token:
+            if possible_unk_token != tokeniser.unk_token:
                 word = word.replace(possible_unk_token, "", 1)
 
             # Replace the token with the word
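
Apart from the British-English rename of `tokenizer` to `tokeniser`, the alignment logic itself is unchanged: only the first sub-token of each word receives the word's label ID, while special tokens and continuation sub-tokens get `-100` so the loss ignores them. A self-contained sketch of that step, with a hard-coded `word_ids` list standing in for what a fast tokeniser would return:

```python
# Hedged sketch of the word/label alignment step: only the first sub-token of
# each word keeps the word's label ID; special tokens and continuation
# sub-tokens get -100 so they are ignored by the loss. The `word_ids` list
# below is hard-coded for illustration (a fast tokeniser would provide it).
label2id = {"o": 0, "b-per": 1, "i-per": 2}
labels = ["b-per", "i-per", "o"]          # one label per word
word_ids = [None, 0, 0, 1, 2, None]       # None marks special tokens

label_ids, previous_word_id = [], None
for word_id in word_ids:
    if word_id is None:
        label_ids.append(-100)            # special token
    elif word_id != previous_word_id:
        label_ids.append(label2id[labels[word_id].lower()])
    else:
        label_ids.append(-100)            # continuation sub-token
    previous_word_id = word_id

print(label_ids)  # [-100, 1, -100, 2, 0, -100]
```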
euroeval/tasks.py CHANGED
@@ -1,8 +1,9 @@
 """All benchmarks tasks used in EuroEval."""
 
 from . import metrics as m
+from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import Task
-from .enums import TaskGroup
+from .enums import GenerativeType, ModelType, TaskGroup
 from .prompt_templates import (
     LA_TEMPLATES,
     MULTIPLE_CHOICE_TEMPLATES,
@@ -28,8 +29,9 @@ LA = Task(
     template_dict=LA_TEMPLATES,
     metrics=[m.mcc_metric, m.macro_f1_metric],
     default_num_few_shot_examples=12,
-    default_max_generated_tokens=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["correct", "incorrect"],
+    uses_logprobs=True,
 )
 
 
@@ -51,6 +53,7 @@ NER = Task(
         "b-misc",
         "i-misc",
     ],
+    uses_structured_output=True,
 )
 
 
@@ -71,8 +74,9 @@ SENT = Task(
     template_dict=SENT_TEMPLATES,
     metrics=[m.mcc_metric, m.macro_f1_metric],
     default_num_few_shot_examples=12,
-    default_max_generated_tokens=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["positive", "neutral", "negative"],
+    uses_logprobs=True,
 )
 
 
@@ -84,6 +88,7 @@ SUMM = Task(
     default_num_few_shot_examples=1,
     default_max_generated_tokens=256,
     default_labels=[],
+    allowed_model_types=[ModelType.GENERATIVE],
 )
 
 
@@ -93,8 +98,9 @@ KNOW = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    uses_logprobs=True,
 )
 
 
@@ -104,8 +110,9 @@ MCRC = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    uses_logprobs=True,
 )
 
 
@@ -115,8 +122,28 @@ COMMON_SENSE = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    uses_logprobs=True,
+)
+
+
+EUROPEAN_VALUES = Task(
+    name="european-values",
+    task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
+    metrics=[m.european_values_metric],
+    default_num_few_shot_examples=0,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
+    allowed_model_types=[ModelType.GENERATIVE],
+    allowed_generative_types=[
+        GenerativeType.INSTRUCTION_TUNED,
+        GenerativeType.REASONING,
+    ],
+    requires_zero_shot=True,
+    uses_logprobs=True,
+    allow_invalid_model_outputs=False,
 )
 
 
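
The `Task` definitions now carry several new fields (`uses_logprobs`, `uses_structured_output`, `allowed_model_types`, `allowed_generative_types`, `requires_zero_shot`, `allow_invalid_model_outputs`), and the hard-coded 5-token generation limit is replaced by `NUM_GENERATION_TOKENS_FOR_CLASSIFICATION` from `euroeval/constants.py`. The sketch below is a guess at how such fields could be modelled; the real definition lives in `euroeval/data_models.py`, the constant's value is not shown in this diff, and the defaults and enum members here are assumptions.

```python
# Hedged sketch of the new Task fields as a plain dataclass. Field names follow
# the keyword arguments used above, but defaults, types, and the remaining
# fields (templates, metrics, ...) are assumptions; see euroeval/data_models.py
# for the real definition.
from dataclasses import dataclass, field
from enum import Enum, auto


class ModelType(Enum):
    ENCODER = auto()
    GENERATIVE = auto()


class GenerativeType(Enum):
    BASE = auto()
    INSTRUCTION_TUNED = auto()
    REASONING = auto()


@dataclass
class Task:
    name: str
    default_max_generated_tokens: int  # e.g. NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
    default_labels: list[str] = field(default_factory=list)
    allowed_model_types: list[ModelType] = field(default_factory=lambda: list(ModelType))
    allowed_generative_types: list[GenerativeType] = field(
        default_factory=lambda: list(GenerativeType)
    )
    requires_zero_shot: bool = False          # evaluate without few-shot examples
    uses_logprobs: bool = False               # score labels via token log-probabilities
    uses_structured_output: bool = False      # ask generative models for JSON output
    allow_invalid_model_outputs: bool = True  # tolerate unparsable generations


european_values = Task(
    name="european-values",
    default_max_generated_tokens=8,  # placeholder; real value comes from constants.py
    default_labels=list("abcdefghijk"),
    allowed_model_types=[ModelType.GENERATIVE],
    allowed_generative_types=[GenerativeType.INSTRUCTION_TUNED, GenerativeType.REASONING],
    requires_zero_shot=True,
    uses_logprobs=True,
    allow_invalid_model_outputs=False,
)
print(european_values)
```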