EuroEval 16.3.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval has been flagged as possibly problematic by the registry.
- euroeval/__init__.py +9 -2
- euroeval/benchmark_config_factory.py +51 -50
- euroeval/benchmark_modules/base.py +9 -21
- euroeval/benchmark_modules/fresh.py +2 -1
- euroeval/benchmark_modules/hf.py +101 -71
- euroeval/benchmark_modules/litellm.py +115 -53
- euroeval/benchmark_modules/vllm.py +107 -92
- euroeval/benchmarker.py +144 -121
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +86 -8
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +80 -29
- euroeval/data_models.py +338 -330
- euroeval/dataset_configs/__init__.py +12 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +55 -93
- euroeval/dataset_configs/dutch.py +48 -87
- euroeval/dataset_configs/english.py +45 -77
- euroeval/dataset_configs/estonian.py +42 -34
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -69
- euroeval/dataset_configs/french.py +39 -75
- euroeval/dataset_configs/german.py +45 -82
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -79
- euroeval/dataset_configs/latvian.py +28 -35
- euroeval/dataset_configs/lithuanian.py +28 -26
- euroeval/dataset_configs/norwegian.py +72 -115
- euroeval/dataset_configs/polish.py +33 -61
- euroeval/dataset_configs/portuguese.py +33 -66
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/spanish.py +42 -77
- euroeval/dataset_configs/swedish.py +52 -90
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +24 -17
- euroeval/generation.py +15 -14
- euroeval/generation_utils.py +8 -8
- euroeval/languages.py +395 -323
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +21 -6
- euroeval/metrics/llm_as_a_judge.py +6 -4
- euroeval/metrics/pipeline.py +17 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +17 -19
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +99 -42
- euroeval/prompt_templates/multiple_choice.py +102 -38
- euroeval/prompt_templates/named_entity_recognition.py +172 -51
- euroeval/prompt_templates/reading_comprehension.py +119 -42
- euroeval/prompt_templates/sentiment_classification.py +110 -40
- euroeval/prompt_templates/summarization.py +85 -40
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +11 -10
- euroeval/speed_benchmark.py +5 -6
- euroeval/task_group_utils/multiple_choice_classification.py +2 -4
- euroeval/task_group_utils/question_answering.py +24 -16
- euroeval/task_group_utils/sequence_classification.py +48 -35
- euroeval/task_group_utils/text_to_text.py +19 -9
- euroeval/task_group_utils/token_classification.py +21 -17
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +33 -22
- euroeval/types.py +10 -9
- euroeval/utils.py +35 -149
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +196 -39
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.3.0.dist-info/RECORD +0 -71
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/task_group_utils/question_answering.py CHANGED

@@ -1,7 +1,6 @@
 """Utility functions related to the question-answering task group."""
 
 import collections.abc as c
-import logging
 import typing as t
 from collections import defaultdict
 
@@ -26,8 +25,6 @@ if t.TYPE_CHECKING:
     from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
-logger = logging.getLogger("euroeval")
-
 
 class QuestionAnsweringTrainer(Trainer):
     """Trainer subclass for question answering tasks."""
@@ -40,7 +37,7 @@ class QuestionAnsweringTrainer(Trainer):
         train_dataset: "Dataset",
         eval_dataset: "Dataset",
         compute_metrics: "c.Callable[[EvalPrediction], dict[str, float]]",
-        callbacks: "
+        callbacks: "c.Sequence[TrainerCallback]",
         data_collator: "c.Callable",
         **kwargs,
     ) -> None:
@@ -70,7 +67,7 @@ class QuestionAnsweringTrainer(Trainer):
         self,
         eval_dataset: "Dataset | None" = None,
         orig_eval_dataset: "Dataset | None" = None,
-        ignore_keys:
+        ignore_keys: c.Sequence[str] | None = None,
         metric_key_prefix: str = "eval",
     ) -> dict[str, float]:
         """Evaluate the model on the given dataset.
@@ -206,7 +203,7 @@ def compute_metrics(
 
 def extract_labels_from_generation(
     input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-) ->
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.
 
     Args:
@@ -268,8 +265,11 @@ def prepare_train_examples(
     max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
     stride = tokeniser.model_max_length // 4
-
-
+    stride = min(
+        stride,
+        tokeniser.model_max_length - stride - max_question_tokens - num_special_tokens,
+    )
+    stride = max(stride, 0)
     max_length = tokeniser.model_max_length - stride
 
     # Tokenise our examples with truncation and padding, but keep the overflows using a
@@ -338,9 +338,17 @@ def prepare_train_examples(
             end_char = start_char + len(answers["text"][0])
 
             # Start token index of the current span in the text.
-
-
-            token_start_index
+            try:
+                token_start_index = 0
+                while sequence_ids[token_start_index] != 1:
+                    token_start_index += 1
+
+            # If it turns out that we cannot find the context in the span, then we
+            # treat this as an impossible case
+            except IndexError:
+                tokenised_examples.start_positions.append(cls_index)
+                tokenised_examples.end_positions.append(cls_index)
+                continue
 
             # End token index of the current span in the text.
             token_end_index = len(input_ids) - 1
@@ -472,7 +480,7 @@ def postprocess_predictions_and_labels(
     dataset: "Dataset",
     prepared_dataset: "Dataset",
     cls_token_index: int,
-) -> tuple[
+) -> tuple[c.Sequence[dict], c.Sequence[dict]]:
     """Postprocess the predictions and labels, to allow easier metric computation.
 
     Args:
@@ -553,7 +561,7 @@ def find_best_answer(
     all_start_logits: np.ndarray,
     all_end_logits: np.ndarray,
     prepared_dataset: "Dataset",
-    feature_indices:
+    feature_indices: c.Sequence[int],
     context: str,
     max_answer_length: int,
     num_best_logits: int,
@@ -586,7 +594,7 @@ def find_best_answer(
         The best answer for the example.
     """
     # Loop through all the features associated to the current example
-    valid_answers = list()
+    valid_answers: list[dict] = list()
     for feature_index in feature_indices:
         # Get the features associated with the current example
         features = prepared_dataset[feature_index]
@@ -627,12 +635,12 @@
 def find_valid_answers(
     start_logits: np.ndarray,
     end_logits: np.ndarray,
-    offset_mapping:
+    offset_mapping: c.Sequence[tuple[int, int]],
     context: str,
     max_answer_length: int,
     num_best_logits: int,
     min_null_score: float,
-) ->
+) -> c.Sequence[dict]:
     """Find the valid answers from the start and end indexes.
 
     Args:
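The stride clamp added to `prepare_train_examples` is worth spelling out: the stride (the overlap between overflowing context windows) is capped so that the question, the special tokens, and the overlap still fit within the model's maximum length, and it is floored at zero. A minimal standalone sketch of that arithmetic (not EuroEval's actual function; the variable names simply mirror the hunk above):

def clamp_stride(
    model_max_length: int, max_question_tokens: int, num_special_tokens: int
) -> int:
    """Return a stride that leaves room for the question and special tokens."""
    stride = model_max_length // 4
    stride = min(
        stride,
        model_max_length - stride - max_question_tokens - num_special_tokens,
    )
    # Never return a negative stride, even for very long questions
    return max(stride, 0)

# A 512-token model with a 100-token question keeps the default stride of 128,
# while a 400-token question would otherwise yield a negative stride and is clamped to 0.
assert clamp_stride(512, 100, 2) == 128
assert clamp_stride(512, 400, 2) == 0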
euroeval/task_group_utils/sequence_classification.py CHANGED

@@ -1,5 +1,6 @@
 """Utility functions related to the sequence-classification task group."""
 
+import collections.abc as c
 import logging
 import re
 import typing as t
@@ -19,13 +20,15 @@ if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
     from transformers.trainer_utils import EvalPrediction
 
-    from ..data_models import
+    from ..data_models import (
+        BenchmarkConfig,
+        DatasetConfig,
+        GenerativeModelOutput,
+        ModelConfig,
+    )
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
@@ -106,8 +109,9 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
+    model_config: "ModelConfig",
     first_label_token_mapping: dict[str, str] | bool,
-) ->
+) -> c.Sequence[str]:
     """Extract the predicted labels from the generated output.
 
     Args:
@@ -118,6 +122,8 @@ def extract_labels_from_generation(
             The raw generated output of the model.
         dataset_config:
             The configuration of the dataset.
+        model_config:
+            The configuration of the model.
         first_label_token_mapping:
             A mapping from labels to the first token in each label, or alternatively a
             Boolean value indicating whether the model should output scores (if the
@@ -167,6 +173,7 @@ def extract_labels_from_generation(
     )
 
     new_predicted_labels: list[str] = list()
+    num_predictions_being_very_off = 0
     for idx, predicted_label in enumerate(model_output.sequences):
         # If the prediction includes a boxed answer, use that instead of the full
         # generation
@@ -199,42 +206,48 @@
         # word edit distance to the predicted label (if invalid model outputs are
         # allowed), or we raise an error
         if min(edit_distances) >= 1000:
-
-            logger.warning(
-                "No candidate labels found for the predicted label "
-                f"{predicted_label!r}, out of the candidate labels "
-                f"{sample_candidate_labels[idx]}. This likely means that the model "
-                "output is completely off, but since invalid model outputs are "
-                "allowed for this task, we will use the closest candidate label "
-                f"({best_candidate_label})) as the output label. If you see this "
-                "warning very often, please report this issue to the EuroEval "
-                "team at github.com/EuroEval/EuroEval/issues."
-            )
-            logger.debug(
-                "The candidate labels were extracted from the prompt: "
-                f"{input_batch['text'][idx]!r}."
-            )
-        else:
-            raise InvalidBenchmark(
-                "No candidate labels found for the predicted label "
-                f"{predicted_label!r}, out of the candidate labels "
-                f"{sample_candidate_labels[idx]}. This likely means that the model "
-                "output is completely off, and we cannot extract any labels from "
-                "it. Please check the model output and the candidate labels. The "
-                "candidate labels were extracted from the prompt: "
-                f"{input_batch['text'][idx]!r}."
-            )
+            num_predictions_being_very_off += 1
 
         new_predicted_labels.append(best_candidate_label)
 
+    if num_predictions_being_very_off > 0:
+        if dataset_config.allow_invalid_model_outputs:
+            log_msg = (
+                "No candidate labels found for the predicted label in "
+                f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+                f"of the samples with the model {model_config.model_id!r}. This "
+                "likely means that the model were completely off in these cases, "
+                "but since invalid model outputs are allowed for this task, we used "
+                "the closest candidate labels as the output labels."
+            )
+            level = logging.DEBUG
+            if num_predictions_being_very_off / len(model_output.sequences) > 0.5:
+                log_msg += (
+                    " Since this happened for most of the model's predictions, please "
+                    "report this issue to the EuroEval team at "
+                    "github.com/EuroEval/EuroEval/issues."
+                )
+                level = logging.WARNING
+            log_once(log_msg, level=level)
+        else:
+            raise InvalidBenchmark(
+                "No candidate labels found for the predicted label in "
+                f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+                "of the samples. This likely means that the model were completely "
+                "off in these cases. Since this task does not allow invalid model "
+                "outputs, we have to abort the evaluation. Please re-run the "
+                "evaluation with the `--debug` flag (or `debug=True` if you're using "
+                "the `Benchmarker` API) to see the precise model outputs."
+            )
+
     return new_predicted_labels
 
 
 def get_closest_logprobs_labels(
-    generation_logprobs:
+    generation_logprobs: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]],
     first_label_token_mapping: dict[str, str] | t.Literal[True],
-    candidate_labels:
-) ->
+    candidate_labels: c.Sequence[c.Sequence[str]],
+) -> c.Sequence[str] | None:
    """Get the labels with the highest predicted logprob value.
 
    In case a candidate label is split into multiple tokens, we only use the first
@@ -355,7 +368,7 @@ get_closest_logprobs_labels
                    "be determined. This means that using logprobs to extract the "
                    "labels is not reliable, and we will instead fall back to "
                    "extracting the labels using word edit distance.",
-                    level=logging.
+                    level=logging.DEBUG,
                )
            else:
                log_once(
@@ -363,7 +376,7 @@ get_closest_logprobs_labels
                    "means that using logprobs to extract the labels is not reliable, "
                    "and we will instead fall back to extracting the labels using "
                    "word edit distance.",
-                    level=logging.
+                    level=logging.DEBUG,
                )
                return None
 
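The refactor above swaps a per-sample warning for a single aggregated message: mismatched predictions are only counted inside the loop, and one log line (or one exception) summarises them afterwards. A minimal sketch of that pattern using the standard `logging` module (illustrative only; EuroEval's `log_once` helper and its exact arguments are not reproduced here):

import logging

logger = logging.getLogger(__name__)


def summarise_invalid_predictions(
    num_off: int, num_total: int, allow_invalid: bool
) -> None:
    """Log (or raise) once per batch instead of once per sample."""
    if num_off == 0:
        return
    message = f"No candidate label matched in {num_off:,}/{num_total:,} samples."
    if not allow_invalid:
        raise ValueError(message)
    # Escalate to WARNING only when most of the predictions were off
    level = logging.WARNING if num_off / num_total > 0.5 else logging.DEBUG
    logger.log(level, message)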
euroeval/task_group_utils/text_to_text.py CHANGED

@@ -1,5 +1,6 @@
 """Utility functions related to the text-to-text task group."""
 
+import collections.abc as c
 import logging
 import typing as t
 
@@ -7,6 +8,7 @@ import numpy as np
 
 from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
 from ..metrics import HuggingFaceMetric
 from ..utils import raise_if_model_output_contains_nan_values
 
@@ -18,9 +20,6 @@ if t.TYPE_CHECKING:
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
@@ -44,6 +43,10 @@ def compute_metrics(
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
         values.
+
+    Raises:
+        InvalidBenchmark:
+            If the metric computation fails.
     """
     model_outputs, labels = model_outputs_and_labels
 
@@ -72,7 +75,7 @@ def compute_metrics(
         ):
             metric.compute_kwargs["device"] = benchmark_config.device.type
 
-
+        for _ in range(num_attempts := 5):
            try:
                score: float | None = metric(
                    predictions=predictions,
@@ -96,21 +99,28 @@ def compute_metrics(
                    and metric.compute_kwargs.get("device", "cpu") != "cpu"
                ):
                    metric.compute_kwargs["device"] = "cpu"
-
+                    log(
                        "Out of memory error occurred during the computation of "
                        f"the metric {metric.pretty_name}. Moving the computation to "
-                        "the CPU."
+                        "the CPU.",
+                        level=logging.DEBUG,
                    )
                else:
                    raise InvalidBenchmark(str(e)) from e
            finally:
                for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
                    if hasattr(metric, attribute):
-
+                        log(
                            f"Deleting the {attribute!r} attribute of the metric "
-                            f"{metric.pretty_name} to free up memory."
+                            f"{metric.pretty_name} to free up memory.",
+                            level=logging.DEBUG,
                        )
                        delattr(metric, attribute)
+        else:
+            raise InvalidBenchmark(
+                f"Could not compute the metric {metric.pretty_name} after "
+                f"{num_attempts} attempts due to out of memory errors."
+            )
 
         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
@@ -122,7 +132,7 @@ def compute_metrics(
 
 def extract_labels_from_generation(
     input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-) ->
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.
 
     Args:
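The `compute_metrics` change bounds the retry behaviour: the metric call is attempted a fixed number of times, an out-of-memory failure moves the computation to the CPU, and Python's `for ... else` clause raises only when every attempt has been exhausted. A small sketch of that control flow under assumed names (hypothetical `compute` callable, built-in `MemoryError` standing in for the accelerator OOM error; this is not EuroEval's API):

from collections.abc import Callable


def compute_with_retries(
    compute: Callable[..., float], kwargs: dict, num_attempts: int = 5
) -> float:
    """Try the metric a bounded number of times, moving to CPU after an OOM."""
    for _ in range(num_attempts):
        try:
            return compute(**kwargs)
        except MemoryError:
            # Fall back to CPU and try again on the next iteration
            kwargs["device"] = "cpu"
    else:
        # The else branch of a for loop runs when the loop finishes without break,
        # i.e. when no attempt returned successfully
        raise RuntimeError(f"Metric failed after {num_attempts} attempts due to OOM.")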
euroeval/task_group_utils/token_classification.py CHANGED

@@ -1,5 +1,6 @@
 """Utility functions related to the token-classification task group."""
 
+import collections.abc as c
 import logging
 import typing as t
 from copy import deepcopy
@@ -7,6 +8,7 @@ from copy import deepcopy
 import numpy as np
 
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
 from ..utils import (
     extract_json_dict_from_string,
     raise_if_model_output_contains_nan_values,
@@ -22,9 +24,6 @@ if t.TYPE_CHECKING:
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     has_misc_tags: bool,
@@ -61,7 +60,9 @@ def compute_metrics(
 
     predictions: list[list[str]]
     if not isinstance(model_outputs[0][0], str):
-        raw_predictions:
+        raw_predictions: c.Sequence[c.Sequence[int]] = np.argmax(
+            model_outputs, axis=-1
+        ).tolist()
 
         # Remove ignored index (special tokens)
         predictions = [
@@ -191,7 +192,7 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
-) ->
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.
 
     Args:
@@ -216,17 +217,19 @@
         prompt_label_mapping = dataset_config.prompt_label_mapping
         for prompt_tag_name, named_entities in prediction_dict.items():
             if not isinstance(named_entities, list):
-
+                log(
                     "The model produced an invalid format for the named entities. "
-                    f"Expected a list but got {type(named_entities)}. Skipping."
+                    f"Expected a list but got {type(named_entities)}. Skipping.",
+                    level=logging.DEBUG,
                 )
                 continue
             try:
                 named_entities = [str(ne) for ne in named_entities]
             except Exception:
-
+                log(
                     "The model produced an invalid format for the named entities. "
-                    f"Expected a list of strings but got {named_entities}. Skipping."
+                    f"Expected a list of strings but got {named_entities}. Skipping.",
+                    level=logging.DEBUG,
                )
                continue
            try:
@@ -236,9 +239,10 @@
                    if prompt_tag == prompt_tag_name
                ][0]
            except IndexError:
-
+                log(
                    "The model produced an invalid prompt tag name, "
-                    f"{prompt_tag_name}. Skipping."
+                    f"{prompt_tag_name}. Skipping.",
+                    level=logging.DEBUG,
                )
                continue
 
@@ -283,8 +287,8 @@ def tokenize_and_align_labels(
     # tokeniser is of a "fast" variant then this can be accessed through the
     # `word_ids` method. Otherwise, we have to extract it manually.
     all_labels: list[list[int]] = list()
-    labels:
-    word_ids:
+    labels: c.Sequence[str]
+    word_ids: c.Sequence[int | None]
     for i, labels in enumerate(examples["labels"]):
         # Try to get the word IDs from the tokeniser
         try:
@@ -294,10 +298,10 @@
         # IDs manually
         except ValueError:
             # Get the list of words in the document
-            words:
+            words: c.Sequence[str] = examples["tokens"][i]
 
             # Get the list of token IDs in the document
-            tok_ids:
+            tok_ids: c.Sequence[int] = tokenized_inputs.input_ids[i]
 
             # Decode the token IDs
             tokens = tokeniser.convert_ids_to_tokens(tok_ids)
@@ -390,8 +394,8 @@
 
 
 def handle_unk_tokens(
-    tokeniser: "PreTrainedTokenizer", tokens: list[str], words:
-) ->
+    tokeniser: "PreTrainedTokenizer", tokens: list[str], words: c.Sequence[str]
+) -> c.Sequence[str]:
     """Replace unknown tokens in the tokens with the corresponding word.
 
     Args:
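The token-classification changes route malformed model output through the new `log` helper at DEBUG level instead of a module-level logger, skipping bad entries rather than failing. A rough sketch of that defensive parsing loop (simplified; the real function also maps prompt tag names back to NER labels):

import logging

logger = logging.getLogger(__name__)


def parse_named_entities(prediction: dict[str, object]) -> dict[str, list[str]]:
    """Keep only well-formed tag -> list-of-strings entries, skipping the rest."""
    parsed: dict[str, list[str]] = {}
    for tag_name, named_entities in prediction.items():
        if not isinstance(named_entities, list):
            # The model produced something other than a list for this tag; skip it
            logger.debug(
                "Invalid format for named entities: expected a list but got "
                f"{type(named_entities)}. Skipping."
            )
            continue
        parsed[tag_name] = [str(entity) for entity in named_entities]
    return parsed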
euroeval/tasks.py CHANGED

@@ -5,12 +5,14 @@ from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import Task
 from .enums import GenerativeType, ModelType, TaskGroup
 from .prompt_templates import (
+    CLASSIFICATION_TEMPLATES,
     LA_TEMPLATES,
     MULTIPLE_CHOICE_TEMPLATES,
     NER_TEMPLATES,
     RC_TEMPLATES,
     SENT_TEMPLATES,
     SUMM_TEMPLATES,
+    TOKEN_CLASSIFICATION_TEMPLATES,
 )
 
 
@@ -20,7 +22,11 @@ def get_all_tasks() -> dict[str, Task]:
     Returns:
         A mapping between names of dataset tasks and their configurations.
     """
-    return {
+    return {
+        cfg.name: cfg
+        for cfg in globals().values()
+        if isinstance(cfg, Task) and cfg != SPEED
+    }
 
 
 LA = Task(
@@ -159,3 +165,40 @@ SPEED = Task(
     default_max_generated_tokens=5,
     default_labels=[],
 )
+
+
+# Used for custom datasets
+
+TEXT_CLASSIFICATION = Task(
+    name="classification",
+    task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
+    template_dict=CLASSIFICATION_TEMPLATES,
+    metrics=[m.mcc_metric, m.macro_f1_metric],
+    default_num_few_shot_examples=12,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=None,
+    uses_logprobs=True,
+)
+
+TOKEN_CLASSIFICATION = Task(
+    name="token-classification",
+    task_group=TaskGroup.TOKEN_CLASSIFICATION,
+    template_dict=TOKEN_CLASSIFICATION_TEMPLATES,
+    metrics=[m.micro_f1_metric],
+    default_num_few_shot_examples=8,
+    default_max_generated_tokens=128,
+    default_labels=None,
+    uses_structured_output=True,
+)
+
+MULTIPLE_CHOICE = Task(
+    name="multiple-choice",
+    task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
+    metrics=[m.mcc_metric, m.accuracy_metric],
+    default_num_few_shot_examples=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=None,
+    default_allowed_model_types=[ModelType.GENERATIVE],
+    uses_logprobs=True,
+)
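`get_all_tasks` now builds its mapping by introspecting the module namespace, so the three new custom-dataset tasks are picked up without touching the function itself. A minimal reproduction of that registry pattern with a stand-in `Task` dataclass (illustrative only, not the EuroEval class):

from dataclasses import dataclass


@dataclass(frozen=True)
class Task:
    name: str


LA = Task(name="linguistic-acceptability")
SPEED = Task(name="speed")
TEXT_CLASSIFICATION = Task(name="classification")


def get_all_tasks() -> dict[str, Task]:
    """Collect every module-level Task instance, excluding the internal SPEED task."""
    return {
        cfg.name: cfg
        for cfg in globals().values()
        if isinstance(cfg, Task) and cfg != SPEED
    }


assert "speed" not in get_all_tasks()
assert "classification" in get_all_tasks()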