EuroEval: euroeval-15.12.0-py3-none-any.whl → euroeval-16.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +32 -14
- euroeval/benchmark_config_factory.py +92 -180
- euroeval/benchmark_modules/base.py +49 -39
- euroeval/benchmark_modules/fresh.py +35 -21
- euroeval/benchmark_modules/hf.py +280 -244
- euroeval/benchmark_modules/litellm.py +752 -312
- euroeval/benchmark_modules/vllm.py +570 -268
- euroeval/benchmarker.py +651 -528
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +49 -38
- euroeval/constants.py +44 -25
- euroeval/data_loading.py +111 -55
- euroeval/data_models.py +490 -323
- euroeval/dataset_configs/__init__.py +26 -4
- euroeval/dataset_configs/bosnian.py +39 -0
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/croatian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +78 -50
- euroeval/dataset_configs/dutch.py +74 -44
- euroeval/dataset_configs/english.py +71 -36
- euroeval/dataset_configs/estonian.py +111 -0
- euroeval/dataset_configs/faroese.py +25 -18
- euroeval/dataset_configs/finnish.py +63 -26
- euroeval/dataset_configs/french.py +65 -32
- euroeval/dataset_configs/german.py +77 -36
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +68 -57
- euroeval/dataset_configs/italian.py +68 -36
- euroeval/dataset_configs/latvian.py +87 -0
- euroeval/dataset_configs/lithuanian.py +64 -0
- euroeval/dataset_configs/norwegian.py +98 -72
- euroeval/dataset_configs/polish.py +96 -0
- euroeval/dataset_configs/portuguese.py +63 -40
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/slovene.py +56 -0
- euroeval/dataset_configs/spanish.py +68 -34
- euroeval/dataset_configs/swedish.py +82 -41
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/enums.py +12 -6
- euroeval/exceptions.py +21 -1
- euroeval/finetuning.py +34 -26
- euroeval/generation.py +76 -41
- euroeval/generation_utils.py +169 -34
- euroeval/languages.py +1020 -188
- euroeval/logging_utils.py +268 -0
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +85 -0
- euroeval/metrics/huggingface.py +216 -0
- euroeval/metrics/llm_as_a_judge.py +260 -0
- euroeval/metrics/pipeline.py +289 -0
- euroeval/metrics/speed.py +48 -0
- euroeval/model_cache.py +40 -21
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +157 -22
- euroeval/prompt_templates/multiple_choice.py +159 -17
- euroeval/prompt_templates/named_entity_recognition.py +318 -21
- euroeval/prompt_templates/reading_comprehension.py +207 -16
- euroeval/prompt_templates/sentiment_classification.py +205 -22
- euroeval/prompt_templates/summarization.py +122 -22
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +20 -9
- euroeval/speed_benchmark.py +11 -12
- euroeval/task_group_utils/multiple_choice_classification.py +21 -12
- euroeval/task_group_utils/question_answering.py +101 -73
- euroeval/task_group_utils/sequence_classification.py +144 -61
- euroeval/task_group_utils/text_to_text.py +33 -12
- euroeval/task_group_utils/token_classification.py +86 -89
- euroeval/tasks.py +75 -16
- euroeval/tokenisation_utils.py +603 -0
- euroeval/types.py +17 -11
- euroeval/utils.py +332 -137
- euroeval-16.7.1.dist-info/METADATA +623 -0
- euroeval-16.7.1.dist-info/RECORD +84 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -737
- euroeval/metrics.py +0 -452
- euroeval/tokenization_utils.py +0 -498
- euroeval-15.12.0.dist-info/METADATA +0 -285
- euroeval-15.12.0.dist-info/RECORD +0 -63
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/task_group_utils/token_classification.py
CHANGED

@@ -1,32 +1,35 @@
 """Utility functions related to the token-classification task group."""
 
+import collections.abc as c
 import logging
-import re
 import typing as t
 from copy import deepcopy
 
-import demjson3
 import numpy as np
 
 from ..exceptions import InvalidBenchmark
-from ..utils import raise_if_model_output_contains_nan_values
+from ..logging_utils import log
+from ..utils import (
+    extract_json_dict_from_string,
+    raise_if_model_output_contains_nan_values,
+)
 
 if t.TYPE_CHECKING:
+    from datasets.arrow_dataset import Dataset
     from transformers.tokenization_utils import PreTrainedTokenizer
     from transformers.tokenization_utils_base import BatchEncoding
     from transformers.trainer_utils import EvalPrediction
 
-    from ..data_models import DatasetConfig, GenerativeModelOutput
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     has_misc_tags: bool,
     dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+    dataset: "Dataset",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
 
@@ -38,6 +41,11 @@ def compute_metrics(
             Whether the dataset has MISC tags.
         dataset_config:
             The configuration of the dataset.
+        benchmark_config:
+            The configuration of the benchmark.
+        dataset:
+            The dataset used for evaluation. This is only used in case any additional
+            metadata is used to compute the metrics.
 
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
@@ -52,7 +60,9 @@ def compute_metrics(
 
     predictions: list[list[str]]
     if not isinstance(model_outputs[0][0], str):
-        raw_predictions: list[list[int]] = np.argmax(model_outputs, axis=-1).tolist()
+        raw_predictions: c.Sequence[c.Sequence[int]] = np.argmax(
+            model_outputs, axis=-1
+        ).tolist()
 
         # Remove ignored index (special tokens)
         predictions = [
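For orientation, the `np.argmax` step in the hunk above reduces per-token logit vectors to label IDs. A standalone sketch with illustrative shapes and values:

```python
import numpy as np

# One sequence, three tokens, three labels (say "o", "b-per", "i-per"): the
# model emits a logit vector per token, and argmax over the last axis picks
# the highest-scoring label ID for each token.
logits = [[[2.0, 0.1, 0.3], [0.2, 3.1, 0.4], [0.1, 0.2, 2.8]]]
raw_predictions = np.argmax(logits, axis=-1).tolist()
print(raw_predictions)  # [[0, 1, 2]]
```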
@@ -136,7 +146,13 @@ def compute_metrics(
         for metric in dataset_config.task.metrics
         if metric.name == "micro_f1"
     )
-    micro_f1_score = metric(predictions=predictions, references=list(labels))
+    micro_f1_score = metric(
+        predictions=predictions,
+        references=list(labels),
+        dataset=dataset,
+        dataset_config=dataset_config,
+        benchmark_config=benchmark_config,
+    )
 
     # Compute the metrics without MISC tags
     # We manually set the F1 metric to be 100% if both the labels and the models
@@ -158,7 +174,11 @@ def compute_metrics(
         if metric.name == "micro_f1_no_misc"
     )
     micro_f1_no_misc_score = metric(
-        predictions=predictions_no_misc, references=labels_no_misc
+        predictions=predictions_no_misc,
+        references=labels_no_misc,
+        dataset=dataset,
+        dataset_config=dataset_config,
+        benchmark_config=benchmark_config,
    )
 
     # Raise error if the metrics are invalid
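Both call sites now thread the whole evaluation context into the metric object. A minimal sketch of a callable satisfying the new convention — hypothetical; the real interface lives in the new `euroeval/metrics/base.py`:

```python
import typing as t


class DummyAccuracyMetric:
    """Hypothetical metric matching the new call convention."""

    name = "dummy_accuracy"

    def __call__(
        self,
        predictions: t.Sequence,
        references: t.Sequence,
        dataset: t.Any,  # extra metadata that some metrics need
        dataset_config: t.Any,
        benchmark_config: t.Any,
    ) -> float:
        # Plain accuracy; the context arguments are accepted but unused here.
        return sum(p == r for p, r in zip(predictions, references)) / len(references)


score = DummyAccuracyMetric()(
    predictions=["b-per", "o"],
    references=["b-per", "i-per"],
    dataset=None,
    dataset_config=None,
    benchmark_config=None,
)
print(score)  # 0.5
```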
@@ -172,7 +192,7 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
-) -> list[t.Any]:
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.
 
     Args:
@@ -187,55 +207,31 @@ def extract_labels_from_generation(
     Returns:
         The predicted labels.
     """
-    raw_predictions = model_output.sequences
-
-    # Attempt to extract the JSON dictionary from the predictions
-    json_regex = r"\{[^{}]+?\}"
-    json_matches = [
-        re.search(pattern=json_regex, string=raw_prediction, flags=re.DOTALL)
-        or raw_prediction
-        for raw_prediction in raw_predictions
-    ]
-    raw_predictions = [
-        json_match.group() if isinstance(json_match, re.Match) else json_match
-        for json_match in json_matches
-    ]
-
     tokens = input_batch["tokens"]
     predicted_labels: list[list[str]] = [["o"] * len(token_ids) for token_ids in tokens]
-    for idx, raw_prediction in enumerate(raw_predictions):
-        try:
-            json_output = demjson3.decode(txt=raw_prediction)
-            if not isinstance(json_output, dict):
-                logger.debug(
-                    "The model output is not a JSON dictionary, so cannot parse "
-                    f"it. Skipping. Here is the output: {raw_prediction}"
-                )
-                continue
-            elif not all(isinstance(key, str) for key in json_output.keys()):
-                logger.debug(
-                    "The model output is not a JSON dictionary with string keys, "
-                    "so cannot parse it. Skipping. Here is the output: "
-                    f"{raw_prediction}"
-                )
-                continue
-            elif not all(isinstance(value, list) for value in json_output.values()):
-                logger.debug(
-                    "The model output is not a JSON dictionary with list values, "
-                    "so cannot parse it. Skipping. Here is the output: "
-                    f"{raw_prediction}"
-                )
-                continue
-            prediction_dict: dict[str, list[str]] = json_output
-        except demjson3.JSONDecodeError:
-            logger.debug(
-                "The model output is not valid JSON, so cannot parse it. Skipping. "
-                f"Here is the output: {raw_prediction!r}"
-            )
+    for idx, raw_prediction in enumerate(model_output.sequences):
+        prediction_dict = extract_json_dict_from_string(s=raw_prediction)
+        if prediction_dict is None:
             continue
 
         prompt_label_mapping = dataset_config.prompt_label_mapping
         for prompt_tag_name, named_entities in prediction_dict.items():
+            if not isinstance(named_entities, list):
+                log(
+                    "The model produced an invalid format for the named entities. "
+                    f"Expected a list but got {type(named_entities)}. Skipping.",
+                    level=logging.DEBUG,
+                )
+                continue
+            try:
+                named_entities = [str(ne) for ne in named_entities]
+            except Exception:
+                log(
+                    "The model produced an invalid format for the named entities. "
+                    f"Expected a list of strings but got {named_entities}. Skipping.",
+                    level=logging.DEBUG,
+                )
+                continue
             try:
                 tag_name = [
                     tag[2:]
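The inline regex-plus-`demjson3` parsing is consolidated into a shared `extract_json_dict_from_string` helper. A rough sketch of what such a helper does — the real implementation lives in `euroeval/utils.py` and is likely more lenient about malformed JSON:

```python
import json
import re


def extract_json_dict_from_string(s: str) -> dict | None:
    """Pull the first {...} blob out of a string and parse it, else None."""
    match = re.search(r"\{[^{}]+?\}", s, flags=re.DOTALL)
    if match is None:
        return None
    try:
        parsed = json.loads(match.group())
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


print(extract_json_dict_from_string('Sure! {"person": ["Merkel"]}'))
# {'person': ['Merkel']}
```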
@@ -243,9 +239,10 @@ def extract_labels_from_generation(
                     if prompt_tag == prompt_tag_name
                 ][0]
             except IndexError:
-                logger.debug(
+                log(
                     "The model produced an invalid prompt tag name, "
-                    f"{prompt_tag_name}. Skipping."
+                    f"{prompt_tag_name}. Skipping.",
+                    level=logging.DEBUG,
                 )
                 continue
 
@@ -265,49 +262,49 @@ def extract_labels_from_generation(
 
 
 def tokenize_and_align_labels(
-    examples: dict, tokenizer: "PreTrainedTokenizer", label2id: dict[str, int]
+    examples: dict, tokeniser: "PreTrainedTokenizer", label2id: dict[str, int]
 ) -> "BatchEncoding":
     """Tokenise all texts and align the labels with them.
 
     Args:
         examples:
             The examples to be tokenised.
-        tokenizer:
-            A pretrained tokenizer.
+        tokeniser:
+            A pretrained tokeniser.
         label2id:
             A dictionary that converts NER tags to IDs.
 
     Returns:
         A dictionary containing the tokenized data as well as labels.
     """
-    # Tokenize the texts. We use the `is_split_into_words` argument here because
+    # Tokenise the texts. We use the `is_split_into_words` argument here because
     # the texts in our dataset are lists of words (with a label for each word)
-    tokenized_inputs = tokenizer(
+    tokenized_inputs = tokeniser(
         examples["tokens"], is_split_into_words=True, truncation=True, padding=True
     )
 
     # Extract a mapping between all the tokens and their corresponding word. If the
-    # tokenizer is of a "fast" variant then this can be accessed through the
+    # tokeniser is of a "fast" variant then this can be accessed through the
     # `word_ids` method. Otherwise, we have to extract it manually.
     all_labels: list[list[int]] = list()
-    labels: list[str]
-    word_ids: list[int | None]
+    labels: c.Sequence[str]
+    word_ids: c.Sequence[int | None]
     for i, labels in enumerate(examples["labels"]):
-        # Try to get the word IDs from the tokenizer
+        # Try to get the word IDs from the tokeniser
         try:
             word_ids = tokenized_inputs.word_ids(batch_index=i)
 
-        # If the tokenizer is not of a "fast" variant, we have to extract the word
+        # If the tokeniser is not of a "fast" variant, we have to extract the word
         # IDs manually
         except ValueError:
             # Get the list of words in the document
-            words: list[str] = examples["tokens"][i]
+            words: c.Sequence[str] = examples["tokens"][i]
 
             # Get the list of token IDs in the document
-            tok_ids: list[int] = tokenized_inputs.input_ids[i]
+            tok_ids: c.Sequence[int] = tokenized_inputs.input_ids[i]
 
             # Decode the token IDs
-            tokens = tokenizer.convert_ids_to_tokens(tok_ids)
+            tokens = tokeniser.convert_ids_to_tokens(tok_ids)
             assert isinstance(tokens, list)
 
             # Remove prefixes from the tokens
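The alignment hinges on the fast tokenisers' `word_ids` method, which maps every subword token back to its source word. A standalone illustration (any fast tokeniser checkpoint works; the exact output depends on the vocabulary):

```python
from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("bert-base-cased")
encoding = tokeniser(
    ["Angela", "Merkel", "visited", "Aarhus"], is_split_into_words=True
)
# None marks special tokens; a repeated index marks a word that was split
# into several subword tokens.
print(encoding.word_ids(batch_index=0))  # e.g. [None, 0, 1, 2, 3, 3, None]
```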
@@ -319,14 +316,14 @@ def tokenize_and_align_labels(
                 tokens[tok_idx] = tok[len(prefix) :]
 
             # Replace UNK tokens with the correct word
-            tokens = handle_unk_tokens(tokenizer=tokenizer, tokens=tokens, words=words)
+            tokens = handle_unk_tokens(tokeniser=tokeniser, tokens=tokens, words=words)
 
-            # Get list of special tokens. Some tokenizers do not record these
+            # Get list of special tokens. Some tokenisers do not record these
             # properly, which is why we convert the values to their indices and
             # then back to strings
             sp_toks = [
-                tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids(sp_tok))
-                for sp_tok in tokenizer.special_tokens_map.values()
+                tokeniser.convert_ids_to_tokens(tokeniser.convert_tokens_to_ids(sp_tok))
+                for sp_tok in tokeniser.special_tokens_map.values()
             ]
 
             # Replace special tokens with `None`
@@ -350,7 +347,7 @@ def tokenize_and_align_labels(
             if len(word_idxs) != len(token_idxs):
                 raise InvalidBenchmark(
                     "The tokens could not be aligned with the words during manual "
-                    "word-token alignment. It seems that the tokenizer is neither "
+                    "word-token alignment. It seems that the tokeniser is neither "
                     "of the fast variant nor of a SentencePiece/WordPiece variant."
                 )
 
@@ -380,9 +377,9 @@ def tokenize_and_align_labels(
                 label = labels[word_id]
                 try:
                     label_id = label2id[label.lower()]
-                except KeyError:
+                except KeyError as e:
                     msg = f"The label {label} was not found in the model's config."
-                    raise InvalidBenchmark(msg)
+                    raise InvalidBenchmark(msg) from e
                 label_ids.append(label_id)
 
             # For the other tokens in a word, we set the label to -100
@@ -397,13 +394,13 @@ def tokenize_and_align_labels(
 
 
 def handle_unk_tokens(
-    tokenizer: "PreTrainedTokenizer", tokens: list[str], words: list[str]
-) -> list[str]:
+    tokeniser: "PreTrainedTokenizer", tokens: list[str], words: c.Sequence[str]
+) -> c.Sequence[str]:
     """Replace unknown tokens in the tokens with the corresponding word.
 
     Args:
-        tokenizer:
-            The tokenizer used to tokenize the words.
+        tokeniser:
+            The tokeniser used to tokenise the words.
         tokens:
             The list of tokens.
         words:
@@ -413,15 +410,15 @@ def handle_unk_tokens(
         The list of tokens with unknown tokens replaced by the corresponding word.
     """
     # Locate the token indices of the unknown tokens
-    token_unk_idxs = [i for i, tok in enumerate(tokens) if tok == tokenizer.unk_token]
+    token_unk_idxs = [i for i, tok in enumerate(tokens) if tok == tokeniser.unk_token]
 
     # Locate the word indices of the words which contain an unknown token
     word_unk_idxs = [
         i
         for i, word in enumerate(words)
-        if tokenizer.unk_token
-        in tokenizer.convert_ids_to_tokens(
-            tokenizer.encode(word, add_special_tokens=False)
+        if tokeniser.unk_token
+        in tokeniser.convert_ids_to_tokens(
+            tokeniser.encode(word, add_special_tokens=False)
         )
     ]
 
@@ -430,9 +427,9 @@ def handle_unk_tokens(
         # Fetch the word
         word = words[word_idx]
 
-        # Tokenize the word, which is now a list containing at least one UNK token
-        tokens_with_unk = tokenizer.convert_ids_to_tokens(
-            tokenizer.encode(word, add_special_tokens=False)
+        # Tokenise the word, which is now a list containing at least one UNK token
+        tokens_with_unk = tokeniser.convert_ids_to_tokens(
+            tokeniser.encode(word, add_special_tokens=False)
         )
 
         # Iterate over the tokens in the word
@@ -441,10 +438,10 @@ def handle_unk_tokens(
             # of the content of this token from the word. The result of the `word`
             # variable will be the content of the UNK token.
             # NOTE: This is a bit hacky and not bulletproof. For instance, if the
-            # word is "1925-1950" and the tokenizer splits it into ["[UNK]", "-",
+            # word is "1925-1950" and the tokeniser splits it into ["[UNK]", "-",
             # "19", "50"], then the result will be 2519 instead of 1925. This
             # happens almost never, however, so we can live with it.
-            if possible_unk_token != tokenizer.unk_token:
+            if possible_unk_token != tokeniser.unk_token:
                 word = word.replace(possible_unk_token, "", 1)
 
         # Replace the token with the word
euroeval/tasks.py
CHANGED

@@ -1,35 +1,29 @@
 """All benchmarks tasks used in EuroEval."""
 
 from . import metrics as m
+from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import Task
-from .enums import TaskGroup
+from .enums import GenerativeType, ModelType, TaskGroup
 from .prompt_templates import (
+    CLASSIFICATION_TEMPLATES,
     LA_TEMPLATES,
     MULTIPLE_CHOICE_TEMPLATES,
     NER_TEMPLATES,
     RC_TEMPLATES,
     SENT_TEMPLATES,
     SUMM_TEMPLATES,
+    TOKEN_CLASSIFICATION_TEMPLATES,
 )
 
-
-def get_all_tasks() -> dict[str, Task]:
-    """Get a list of all the dataset tasks.
-
-    Returns:
-        A mapping between names of dataset tasks and their configurations.
-    """
-    return {cfg.name: cfg for cfg in globals().values() if isinstance(cfg, Task)}
-
-
 LA = Task(
     name="linguistic-acceptability",
     task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
     template_dict=LA_TEMPLATES,
     metrics=[m.mcc_metric, m.macro_f1_metric],
     default_num_few_shot_examples=12,
-    default_max_generated_tokens=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["correct", "incorrect"],
+    uses_logprobs=True,
 )
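The `get_all_tasks` registry function is gone, but the tasks remain plain module-level `Task` objects, so the old mapping can still be rebuilt externally if needed:

```python
import euroeval.tasks
from euroeval.data_models import Task

# Equivalent to the removed get_all_tasks(): map task names to their configs.
all_tasks = {
    cfg.name: cfg for cfg in vars(euroeval.tasks).values() if isinstance(cfg, Task)
}
print(sorted(all_tasks))
```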
@@ -51,6 +45,7 @@ NER = Task(
         "b-misc",
         "i-misc",
     ],
+    uses_structured_output=True,
 )
 
 
@@ -71,8 +66,9 @@ SENT = Task(
     template_dict=SENT_TEMPLATES,
     metrics=[m.mcc_metric, m.macro_f1_metric],
     default_num_few_shot_examples=12,
-    default_max_generated_tokens=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["positive", "neutral", "negative"],
+    uses_logprobs=True,
 )
 
 
@@ -84,6 +80,7 @@ SUMM = Task(
     default_num_few_shot_examples=1,
     default_max_generated_tokens=256,
     default_labels=[],
+    default_allowed_model_types=[ModelType.GENERATIVE],
 )
 
 
@@ -93,8 +90,10 @@ KNOW = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    default_allowed_model_types=[ModelType.GENERATIVE],
+    uses_logprobs=True,
 )
 
 
@@ -104,8 +103,10 @@ MCRC = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    default_allowed_model_types=[ModelType.GENERATIVE],
+    uses_logprobs=True,
 )
 
 
@@ -115,8 +116,29 @@ COMMON_SENSE = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    default_allowed_model_types=[ModelType.GENERATIVE],
+    uses_logprobs=True,
+)
+
+
+EUROPEAN_VALUES = Task(
+    name="european-values",
+    task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
+    metrics=[m.european_values_metric],
+    default_num_few_shot_examples=0,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
+    default_allowed_model_types=[ModelType.GENERATIVE],
+    default_allowed_generative_types=[
+        GenerativeType.INSTRUCTION_TUNED,
+        GenerativeType.REASONING,
+    ],
+    requires_zero_shot=True,
+    uses_logprobs=True,
+    default_allow_invalid_model_outputs=False,
 )
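The `uses_logprobs` flag on the classification-style tasks presumably lets the evaluator read the label off the log-probabilities of the generated label token instead of parsing free-form text. A schematic of that scoring idea (label set and numbers are made up):

```python
labels = ["a", "b", "c", "d"]
first_token_logprobs = {"a": -2.3, "b": -0.4, "d": -1.9}  # as returned by a model API

# Pick the candidate label with the highest log-probability; labels the model
# assigned no mass to fall back to -inf.
prediction = max(labels, key=lambda lbl: first_token_logprobs.get(lbl, float("-inf")))
print(prediction)  # 'b'
```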
@@ -129,3 +151,40 @@ SPEED = Task(
     default_max_generated_tokens=5,
     default_labels=[],
 )
+
+
+# Used for custom datasets
+
+TEXT_CLASSIFICATION = Task(
+    name="classification",
+    task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
+    template_dict=CLASSIFICATION_TEMPLATES,
+    metrics=[m.mcc_metric, m.macro_f1_metric],
+    default_num_few_shot_examples=12,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=None,
+    uses_logprobs=True,
+)
+
+TOKEN_CLASSIFICATION = Task(
+    name="token-classification",
+    task_group=TaskGroup.TOKEN_CLASSIFICATION,
+    template_dict=TOKEN_CLASSIFICATION_TEMPLATES,
+    metrics=[m.micro_f1_metric],
+    default_num_few_shot_examples=8,
+    default_max_generated_tokens=128,
+    default_labels=None,
+    uses_structured_output=True,
+)
+
+MULTIPLE_CHOICE = Task(
+    name="multiple-choice",
+    task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
+    metrics=[m.mcc_metric, m.accuracy_metric],
+    default_num_few_shot_examples=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=None,
+    default_allowed_model_types=[ModelType.GENERATIVE],
+    uses_logprobs=True,
+)
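All three custom-dataset tasks ship with `default_labels=None`, so a user-supplied dataset configuration is expected to provide its own label set. A quick sanity check of the exported objects:

```python
from euroeval.tasks import MULTIPLE_CHOICE, TEXT_CLASSIFICATION, TOKEN_CLASSIFICATION

for task in (TEXT_CLASSIFICATION, TOKEN_CLASSIFICATION, MULTIPLE_CHOICE):
    # default_labels is None for all three: the custom dataset supplies labels.
    print(task.name, task.default_labels)
```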