EuroEval 16.0.1__py3-none-any.whl → 16.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of EuroEval might be problematic.
- euroeval/benchmark_config_factory.py +6 -1
- euroeval/benchmark_modules/base.py +2 -0
- euroeval/benchmark_modules/fresh.py +7 -1
- euroeval/benchmark_modules/hf.py +26 -21
- euroeval/benchmark_modules/litellm.py +258 -131
- euroeval/benchmark_modules/vllm.py +79 -40
- euroeval/benchmarker.py +11 -2
- euroeval/cli.py +14 -1
- euroeval/constants.py +1 -1
- euroeval/data_models.py +77 -6
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +14 -0
- euroeval/dataset_configs/dutch.py +14 -0
- euroeval/dataset_configs/english.py +22 -0
- euroeval/dataset_configs/estonian.py +15 -7
- euroeval/dataset_configs/finnish.py +14 -0
- euroeval/dataset_configs/french.py +14 -0
- euroeval/dataset_configs/german.py +23 -0
- euroeval/dataset_configs/italian.py +14 -0
- euroeval/dataset_configs/latvian.py +14 -0
- euroeval/dataset_configs/norwegian.py +14 -0
- euroeval/dataset_configs/polish.py +126 -0
- euroeval/dataset_configs/portuguese.py +14 -0
- euroeval/dataset_configs/spanish.py +14 -0
- euroeval/dataset_configs/swedish.py +25 -0
- euroeval/enums.py +12 -0
- euroeval/generation.py +17 -8
- euroeval/generation_utils.py +65 -11
- euroeval/metrics/pipeline.py +1 -1
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +27 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +11 -0
- euroeval/prompt_templates/sentiment_classification.py +15 -0
- euroeval/prompt_templates/summarization.py +27 -1
- euroeval/scores.py +5 -0
- euroeval/task_group_utils/question_answering.py +29 -29
- euroeval/task_group_utils/sequence_classification.py +11 -34
- euroeval/task_group_utils/token_classification.py +3 -3
- euroeval/tasks.py +4 -4
- euroeval/{tokenization_utils.py → tokenisation_utils.py} +50 -28
- euroeval/utils.py +36 -3
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/METADATA +1 -1
- euroeval-16.1.1.dist-info/RECORD +70 -0
- euroeval-16.0.1.dist-info/RECORD +0 -69
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/WHEEL +0 -0
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/entry_points.txt +0 -0
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/licenses/LICENSE +0 -0
euroeval/prompt_templates/linguistic_acceptability.py
CHANGED

@@ -19,6 +19,7 @@ from ..languages import (
     NL,
     NN,
     NO,
+    PL,
     PT,
     SV,
 )
@@ -67,6 +68,14 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
         default_instruction_prompt="Lause: {text}\n\nOtsusta, kas lause on "
         "grammatiliselt õige või mitte. Vasta {labels_str}, ja mitte midagi muud.",
     ),
+    PL: PromptConfig(
+        default_prompt_label_mapping=dict(correct="tak", incorrect="nie"),
+        default_prompt_prefix="Poniżej znajdują się teksty i czy są "
+        "gramatycznie poprawne.",
+        default_prompt_template="Tekst: {text}\nGramatycznie poprawny: {label}",
+        default_instruction_prompt="Tekst: {text}\n\nOkreśl czy tekst jest "
+        "gramatycznie poprawny czy nie. Odpowiedz {labels_str}, i nic więcej.",
+    ),
     PT: PromptConfig(
         default_prompt_label_mapping=dict(correct="sim", incorrect="não"),
         default_prompt_prefix="Seguem-se abaixo textos e se são "
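The new Polish entry follows the same shape as the existing languages: a prefix, a per-example template, an instruction prompt and a label mapping. Below is a minimal sketch, not EuroEval's actual rendering code, of how these fields could be combined into a few-shot prompt; the example sentences and the blank-line joining are illustrative assumptions.

# Minimal sketch of rendering the new Polish linguistic-acceptability template.
# The example sentences and the blank-line joining are assumptions, not EuroEval code.
prefix = "Poniżej znajdują się teksty i czy są gramatycznie poprawne."
template = "Tekst: {text}\nGramatycznie poprawny: {label}"
label_mapping = dict(correct="tak", incorrect="nie")

few_shot_examples = [
    ("Ala ma kota.", "correct"),           # hypothetical few-shot examples
    ("Kota ma Ala nie są.", "incorrect"),
]

sections = [prefix] + [
    template.format(text=text, label=label_mapping[label])
    for text, label in few_shot_examples
]
print("\n\n".join(sections))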
euroeval/prompt_templates/multiple_choice.py
CHANGED

@@ -3,7 +3,25 @@
 import typing as t
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, ET, FI, FR, IS, IT, LV, NB, NL, NN, NO, PT, SV
+from ..languages import (
+    DA,
+    DE,
+    EN,
+    ES,
+    ET,
+    FI,
+    FR,
+    IS,
+    IT,
+    LV,
+    NB,
+    NL,
+    NN,
+    NO,
+    PL,
+    PT,
+    SV,
+)
 
 if t.TYPE_CHECKING:
     from ..data_models import Language
@@ -123,6 +141,14 @@ MULTIPLE_CHOICE_TEMPLATES: dict["Language", PromptConfig] = {
         "{labels_str}, og ikke noe annet.",
         default_prompt_label_mapping="auto",
     ),
+    PL: PromptConfig(
+        default_prompt_prefix="Poniżej znajdują się pytania wielokrotnego wyboru "
+        "(z odpowiedziami).",
+        default_prompt_template="Pytanie: {text}\nOdpowiedź: {label}",
+        default_instruction_prompt="Pytanie: {text}\n\nOdpowiedz na powyższe pytanie, "
+        "odpowiadając {labels_str}, i nic więcej.",
+        default_prompt_label_mapping="auto",
+    ),
     SV: PromptConfig(
         default_prompt_prefix="Följande är flervalsfrågor (med svar).",
         default_prompt_template="Fråga: {text}\nSvar: {label}",
euroeval/prompt_templates/named_entity_recognition.py
CHANGED

@@ -19,6 +19,7 @@ from ..languages import (
     NL,
     NN,
     NO,
+    PL,
     PT,
     SV,
 )
@@ -336,6 +337,25 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
         "Verdiene skal være lister over de navngitte enhetene "
         "av den typen, akkurat som de vises i frasen.",
     ),
+    PL: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "osoba",
+            "i-per": "osoba",
+            "b-loc": "lokalizacja",
+            "i-loc": "lokalizacja",
+            "b-org": "organizacja",
+            "i-org": "organizacja",
+            "b-misc": "różne",
+            "i-misc": "różne",
+        },
+        default_prompt_prefix="Poniżej znajdują się zdania i słowniki JSON z nazwanymi "
+        "jednostkami występującymi w danym zdaniu.",
+        default_prompt_template="Zdanie: {text}\nNazwane jednostki: {label}",
+        default_instruction_prompt="Zdanie: {text}\n\nZidentyfikuj nazwane jednostki "
+        "w zdaniu. Powinieneś wypisać to jako słownik JSON z kluczami "
+        "{labels_str}. Wartości powinny być listami nazwanych jednostek "
+        "tego typu, dokładnie tak jak pojawiają się w zdaniu.",
+    ),
     SV: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "person",
euroeval/prompt_templates/reading_comprehension.py
CHANGED

@@ -19,6 +19,7 @@ from ..languages import (
     NL,
     NN,
     NO,
+    PL,
     PT,
     SV,
 )
@@ -157,6 +158,16 @@ RC_TEMPLATES: dict["Language", PromptConfig] = {
         "teksten ovenfor med maks 3 ord.\n\nSpørsmål: {question}",
         default_prompt_label_mapping=dict(),
     ),
+    PL: PromptConfig(
+        default_prompt_prefix=(
+            "Poniżej znajdują się teksty z towarzyszącymi pytaniami i odpowiedziami."
+        ),
+        default_prompt_template="Tekst: {text}\nPytanie: {question}\nOdpowiedź w "
+        "maksymalnie 3 słowach: {label}",
+        default_instruction_prompt="Tekst: {text}\n\nOdpowiedz na następujące pytanie "
+        "dotyczące powyższego tekstu w maksymalnie 3 słowach.\n\nPytanie: {question}",
+        default_prompt_label_mapping=dict(),
+    ),
     PT: PromptConfig(
         default_prompt_prefix="Os textos que se seguem são acompanhados de perguntas "
         "e respostas.",
euroeval/prompt_templates/sentiment_classification.py
CHANGED

@@ -19,6 +19,7 @@ from ..languages import (
     NL,
     NN,
     NO,
+    PL,
     PT,
     SV,
 )
@@ -78,6 +79,20 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
         "meelestatuse järgi. Võimalikud vastused: {labels_str}. Muud vastused "
         "ei ole lubatud.",
     ),
+    PL: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="pozytywny", neutral="neutralny", negative="negatywny"
+        ),
+        default_prompt_prefix=(
+            "Poniżej znajdują się dokumenty i ich sentyment, który może być "
+            "{labels_str}."
+        ),
+        default_prompt_template="Dokument: {text}\nSentyment: {label}",
+        default_instruction_prompt=(
+            "Dokument: {text}\n\nKlasyfikuj sentyment w dokumencie. "
+            "Odpowiedz z {labels_str}, i nic więcej."
+        ),
+    ),
     PT: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="positivo", neutral="neutro", negative="negativo"
euroeval/prompt_templates/summarization.py
CHANGED

@@ -3,7 +3,25 @@
 import typing as t
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, ET, FI, FR, IS, IT, LV, NB, NL, NN, NO, PT, SV
+from ..languages import (
+    DA,
+    DE,
+    EN,
+    ES,
+    ET,
+    FI,
+    FR,
+    IS,
+    IT,
+    LV,
+    NB,
+    NL,
+    NN,
+    NO,
+    PL,
+    PT,
+    SV,
+)
 
 if t.TYPE_CHECKING:
     from ..data_models import Language
@@ -122,6 +140,14 @@ SUMM_TEMPLATES: dict["Language", PromptConfig] = {
         "dokumentet ovenfor.",
         default_prompt_label_mapping=dict(),
     ),
+    PL: PromptConfig(
+        default_prompt_prefix="Poniżej znajdują się artykuły z towarzyszącymi "
+        "streszczeniami.",
+        default_prompt_template="Artykuł: {text}\nStreszczenie: {target_text}",
+        default_instruction_prompt="Artykuł: {text}\n\nNapisz streszczenie "
+        "powyższego artykułu.",
+        default_prompt_label_mapping=dict(),
+    ),
     SV: PromptConfig(
         default_prompt_prefix="Nedan följer dokument med tillhörande sammanfattningar.",
         default_prompt_template="Dokument: {text}\nSammanfattning: {target_text}",
euroeval/scores.py
CHANGED

@@ -19,6 +19,7 @@ def log_scores(
     scores: list[dict[str, float]],
     model_id: str,
     model_revision: str,
+    model_param: str | None,
 ) -> "ScoreDict":
     """Log the scores.
 
@@ -34,6 +35,8 @@ def log_scores(
             The model ID of the model that was evaluated.
         model_revision:
             The revision of the model.
+        model_param:
+            The model parameter, if any.
 
     Returns:
         A dictionary with keys 'raw_scores' and 'total', with 'raw_scores' being
@@ -42,6 +45,8 @@ def log_scores(
     """
     if model_revision and model_revision != "main":
        model_id += f"@{model_revision}"
+    if model_param is not None:
+        model_id += f"#{model_param}"
 
     logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")
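The practical effect of the new `model_param` argument is on the reported model ID. Below is a standalone sketch mirroring the formatting shown in the hunk above (the helper name and example values are hypothetical, not part of EuroEval):

def format_model_id(model_id: str, model_revision: str, model_param: str | None) -> str:
    # Non-default revisions are appended with '@'; as of this release an optional
    # model parameter is appended with '#'.
    if model_revision and model_revision != "main":
        model_id += f"@{model_revision}"
    if model_param is not None:
        model_id += f"#{model_param}"
    return model_id

print(format_model_id("my-org/my-model", "main", None))       # my-org/my-model
print(format_model_id("my-org/my-model", "v2", "thinking"))   # my-org/my-model@v2#thinking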
euroeval/task_group_utils/question_answering.py
CHANGED

@@ -10,7 +10,7 @@ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from transformers.trainer import Trainer
 
 from ..exceptions import InvalidBenchmark
-from ..tokenization_utils import get_special_token_metadata
+from ..tokenisation_utils import get_special_token_metadata
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
@@ -261,7 +261,7 @@ def prepare_train_examples(
     ]
     examples["context"] = [f"{c}{sep_token}" for c in examples["context"]]
 
-    # Set the stride used during tokenization, when the context is long enough to be
+    # Set the stride used during tokenisation, when the context is long enough to be
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
@@ -272,11 +272,11 @@ def prepare_train_examples(
     stride = min(stride, max_length - max_question_tokens - num_special_tokens)
     max_length = tokeniser.model_max_length - stride
 
-    # Tokenize our examples with truncation and padding, but keep the overflows using a
+    # Tokenise our examples with truncation and padding, but keep the overflows using a
     # stride. This results in one example possible giving several features when a
     # context is long, each of those features having a context that overlaps a bit the
     # context of the previous feature.
-    tokenized_examples = tokeniser(
+    tokenised_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
         truncation="only_second",
@@ -290,27 +290,27 @@ def prepare_train_examples(
     # Since one example might give us several features if it has a long context, we
     # need a map from a feature to its corresponding example. This key gives us just
     # that
-    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+    sample_mapping = tokenised_examples.pop("overflow_to_sample_mapping")
 
     # The offset mappings will give us a map from token to character position in the
     # original context. This will help us compute the start_positions and
     # end_positions.
-    offset_mapping = tokenized_examples.pop("offset_mapping")
+    offset_mapping = tokenised_examples.pop("offset_mapping")
 
     # Initialise the start- and end positions of the answers
-    tokenized_examples["start_positions"] = list()
-    tokenized_examples["end_positions"] = list()
+    tokenised_examples["start_positions"] = list()
+    tokenised_examples["end_positions"] = list()
 
     for i, offsets in enumerate(offset_mapping):
         # Get the input IDs for the current example
-        input_ids = tokenized_examples.input_ids[i]
+        input_ids = tokenised_examples.input_ids[i]
 
         # We will label impossible answers with the index of the CLS token
         cls_index = input_ids.index(cls_token_id)
 
         # Grab the sequence corresponding to that example (to know what is the context
         # and what is the question).
-        sequence_ids = tokenized_examples.sequence_ids(i)
+        sequence_ids = tokenised_examples.sequence_ids(i)
 
         # Manually ensure that the special tokens are set to None in `sequence_ids`
         for special_token in tokeniser.special_tokens_map.keys():
@@ -329,8 +329,8 @@ def prepare_train_examples(
 
         # If no answers are given, set the cls_index as answer.
         if len(answers["answer_start"]) == 0:
-            tokenized_examples.start_positions.append(cls_index)
-            tokenized_examples.end_positions.append(cls_index)
+            tokenised_examples.start_positions.append(cls_index)
+            tokenised_examples.end_positions.append(cls_index)
 
         else:
             # Start/end character index of the answer in the text.
@@ -353,8 +353,8 @@ def prepare_train_examples(
                 offsets[token_start_index][0] <= start_char
                 and offsets[token_end_index][1] >= end_char
             ):
-                tokenized_examples.start_positions.append(cls_index)
-                tokenized_examples.end_positions.append(cls_index)
+                tokenised_examples.start_positions.append(cls_index)
+                tokenised_examples.end_positions.append(cls_index)
 
             # Otherwise move the token_start_index and token_end_index to the two ends
             # of the answer. Note: we could go after the last offset if the answer is
@@ -366,17 +366,17 @@ def prepare_train_examples(
                 ):
                     token_start_index += 1
                 token_start_index -= 1
-                tokenized_examples.start_positions.append(token_start_index)
+                tokenised_examples.start_positions.append(token_start_index)
                 while (
                     token_start_index <= token_end_index
                     and offsets[token_end_index][1] >= end_char
                 ):
                     token_end_index -= 1
                 token_end_index += 1
-                tokenized_examples.end_positions.append(token_end_index)
+                tokenised_examples.end_positions.append(token_end_index)
                 assert token_end_index >= token_start_index
 
-    return tokenized_examples
+    return tokenised_examples
 
 
 def prepare_test_examples(
@@ -394,7 +394,7 @@ def prepare_test_examples(
         The prepared test examples.
     """
     # Some of the questions have lots of whitespace on the left, which is not useful
-    # and will make the truncation of the context fail (the tokenized question will
+    # and will make the truncation of the context fail (the tokenised question will
     # take a lots of space). So we remove that left whitespace
     examples["question"] = [q.lstrip() for q in examples["question"]]
 
@@ -412,7 +412,7 @@ def prepare_test_examples(
     ]
     examples["context"] = [f"{c}{sep_token}" for c in examples["context"]]
 
-    # Set the stride used during tokenization, when the context is long enough to be
+    # Set the stride used during tokenisation, when the context is long enough to be
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
@@ -423,11 +423,11 @@ def prepare_test_examples(
     stride = min(stride, max_length - max_question_tokens - num_special_tokens)
     max_length = tokeniser.model_max_length - stride
 
-    # Tokenize our examples with truncation and maybe padding, but keep the overflows
+    # Tokenise our examples with truncation and maybe padding, but keep the overflows
     # using a stride. This results in one example possible giving several features when
     # a context is long, each of those features having a context that overlaps a bit
     # the context of the previous feature.
-    tokenized_examples = tokeniser(
+    tokenised_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
         truncation="only_second",
@@ -441,30 +441,30 @@ def prepare_test_examples(
     # Since one example might give us several features if it has a long context, we
     # need a map from a feature to its corresponding example. This key gives us just
     # that.
-    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+    sample_mapping = tokenised_examples.pop("overflow_to_sample_mapping")
 
     # We keep the id that gave us this feature and we will store the offset mappings.
-    tokenized_examples["id"] = list()
+    tokenised_examples["id"] = list()
 
-    for i in range(len(tokenized_examples.input_ids)):
+    for i in range(len(tokenised_examples.input_ids)):
         # Grab the sequence corresponding to that example (to know what is the context
         # and what is the question).
-        sequence_ids = tokenized_examples.sequence_ids(i)
+        sequence_ids = tokenised_examples.sequence_ids(i)
         context_index = 1
 
         # One example can give several spans, this is the index of the example
         # containing this span of text.
         sample_index = sample_mapping[i]
-        tokenized_examples.id.append(examples["id"][sample_index])
+        tokenised_examples.id.append(examples["id"][sample_index])
 
         # Set to (-1, -1) the offset_mapping that are not part of the context so it's
         # easy to determine if a token position is part of the context or not.
-        tokenized_examples.offset_mapping[i] = [
+        tokenised_examples.offset_mapping[i] = [
             (o if sequence_ids[k] == context_index else (-1, -1))
-            for k, o in enumerate(tokenized_examples.offset_mapping[i])
+            for k, o in enumerate(tokenised_examples.offset_mapping[i])
         ]
 
-    return tokenized_examples
+    return tokenised_examples
 
 
 def postprocess_predictions_and_labels(
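The renamed helpers above build on the Hugging Face tokeniser's overflow handling. Below is a self-contained sketch of that call with an illustrative model name and lengths (not EuroEval's defaults), showing where `overflow_to_sample_mapping` and `offset_mapping` come from:

from transformers import AutoTokenizer

# Model name, max_length and stride are illustrative assumptions.
tokeniser = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
encoded = tokeniser(
    text=["What colour is the sky?"],
    text_pair=["A very long context about the sky. " * 200],
    truncation="only_second",        # only the context is ever truncated
    max_length=384,
    stride=128,                      # overlap between consecutive context windows
    return_overflowing_tokens=True,  # a long context yields several features
    return_offsets_mapping=True,     # token -> character spans in the context
    padding="max_length",
)
# Maps every produced feature back to the example it came from.
print(encoded["overflow_to_sample_mapping"])
# One offset list per feature, used to compute start/end answer positions.
print(len(encoded["offset_mapping"]))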
euroeval/task_group_utils/sequence_classification.py
CHANGED

@@ -198,8 +198,8 @@ def extract_labels_from_generation(
     # If no candidate labels were found, we either pick the label with the smallest
     # word edit distance to the predicted label (if invalid model outputs are
     # allowed), or we raise an error
-    if min(edit_distances)
-        if dataset_config.
+    if min(edit_distances) >= 1000:
+        if dataset_config.allow_invalid_model_outputs:
             logger.warning(
                 "No candidate labels found for the predicted label "
                 f"{predicted_label!r}, out of the candidate labels "
@@ -296,19 +296,7 @@ def get_closest_logprobs_labels(
             candidate_output_labels = {
                 candidate_label
                 for candidate_label in candidate_labels[idx]
-                if candidate_label.startswith(generated_label)
-            }
-
-            # If the generated label is a numeral (e.g., "1", "2", "3") and there is
-            # a matching candidate label, we only keep the full match
-            if re.match(r"^\d+$", generated_label) and any(
-                candidate_label == generated_label
-                for candidate_label in candidate_output_labels
-            ):
-                candidate_output_labels = {
-                    candidate_label
-                    for candidate_label in candidate_output_labels
-                    if candidate_label == generated_label
+                if candidate_label.startswith(generated_label.strip())
             }
 
             # If we can uniquely determine the output label, we break the loop.
@@ -357,19 +345,6 @@ def get_closest_logprobs_labels(
                 )
                 return None
 
-            # If we did not find any candidate label for any of the generated labels, we
-            # assume that something is wrong with the model output, and we fall back to
-            # using word edit distance to extract the labels
-            else:
-                log_once(
-                    f"No candidate label found for any of the generated labels "
-                    f"{generated_labels}. This means that using logprobs to extract "
-                    "the labels is not reliable, and we will instead fall back to "
-                    "extracting the labels using word edit distance.",
-                    level=logging.DEBUG,
-                )
-                return None
-
             if output_label is not None:
                 output_labels.append(output_label)
                 break
@@ -377,18 +352,20 @@ def get_closest_logprobs_labels(
         if len(sample) == 0:
             log_once(
                 "The model outputted an empty string, so no candidate labels could "
-                "be determined.
-
+                "be determined. This means that using logprobs to extract the "
+                "labels is not reliable, and we will instead fall back to "
+                "extracting the labels using word edit distance.",
                 level=logging.INFO,
             )
         else:
             log_once(
-                "
-
-
+                "No candidate label found for any of the generated labels, which "
+                "means that using logprobs to extract the labels is not reliable, "
+                "and we will instead fall back to extracting the labels using "
+                "word edit distance.",
                 level=logging.INFO,
             )
-
+        return None
 
     assert len(output_labels) == len(generation_logprobs)
     return output_labels
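The simplification above drops the numeral special-casing in favour of plain prefix matching on the stripped generated token. A small illustrative sketch of that matching rule (the labels and generated strings are made up, and the helper is not part of EuroEval):

def match_candidates(generated_label: str, candidate_labels: list[str]) -> set[str]:
    # A candidate survives if it starts with the generated token, ignoring
    # surrounding whitespace; a unique survivor determines the output label.
    return {
        candidate_label
        for candidate_label in candidate_labels
        if candidate_label.startswith(generated_label.strip())
    }

print(match_candidates(" pos", ["positive", "negative", "neutral"]))  # {'positive'}
print(match_candidates("ne", ["positive", "negative", "neutral"]))    # ambiguous: two matches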
euroeval/task_group_utils/token_classification.py
CHANGED

@@ -273,7 +273,7 @@ def tokenize_and_align_labels(
     Returns:
         A dictionary containing the tokenized data as well as labels.
     """
-    # Tokenize the texts. We use the `is_split_into_words` argument here because
+    # Tokenise the texts. We use the `is_split_into_words` argument here because
     # the texts in our dataset are lists of words (with a label for each word)
     tokenized_inputs = tokeniser(
         examples["tokens"], is_split_into_words=True, truncation=True, padding=True
@@ -396,7 +396,7 @@ def handle_unk_tokens(
 
     Args:
         tokeniser:
-            The tokeniser used to tokenize the words.
+            The tokeniser used to tokenise the words.
         tokens:
             The list of tokens.
         words:
@@ -423,7 +423,7 @@ def handle_unk_tokens(
     # Fetch the word
     word = words[word_idx]
 
-    # Tokenize the word, which is now a list containing at least one UNK token
+    # Tokenise the word, which is now a list containing at least one UNK token
     tokens_with_unk = tokeniser.convert_ids_to_tokens(
         tokeniser.encode(word, add_special_tokens=False)
     )
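The fixed comment refers to the tokeniser's `is_split_into_words` mode, where the dataset provides pre-split words and sub-word tokens are mapped back to them for label alignment. An illustrative, self-contained sketch (the model name is an assumption):

from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
encoded = tokeniser(
    [["Maria", "bor", "i", "København"]],
    is_split_into_words=True,
    truncation=True,
    padding=True,
)
# word_ids() maps each token position to the index of the word it came from
# (None for special tokens), which is what the label alignment relies on.
print(encoded.word_ids(0))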
euroeval/tasks.py
CHANGED

@@ -88,7 +88,7 @@ SUMM = Task(
     default_num_few_shot_examples=1,
     default_max_generated_tokens=256,
     default_labels=[],
-
+    default_allowed_model_types=[ModelType.GENERATIVE],
 )
 
 
@@ -136,14 +136,14 @@ EUROPEAN_VALUES = Task(
     default_num_few_shot_examples=0,
     default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
-
-
+    default_allowed_model_types=[ModelType.GENERATIVE],
+    default_allowed_generative_types=[
         GenerativeType.INSTRUCTION_TUNED,
         GenerativeType.REASONING,
     ],
     requires_zero_shot=True,
     uses_logprobs=True,
-
+    default_allow_invalid_model_outputs=False,
 )
 
 