EuroEval 16.2.2-py3-none-any.whl → 16.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +7 -4
- euroeval/benchmark_config_factory.py +0 -4
- euroeval/benchmark_modules/base.py +3 -16
- euroeval/benchmark_modules/fresh.py +5 -2
- euroeval/benchmark_modules/hf.py +107 -66
- euroeval/benchmark_modules/litellm.py +103 -55
- euroeval/benchmark_modules/vllm.py +155 -82
- euroeval/benchmarker.py +184 -129
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +1 -1
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +14 -11
- euroeval/data_models.py +12 -4
- euroeval/dataset_configs/__init__.py +3 -0
- euroeval/dataset_configs/czech.py +79 -0
- euroeval/dataset_configs/danish.py +10 -13
- euroeval/dataset_configs/dutch.py +0 -3
- euroeval/dataset_configs/english.py +0 -3
- euroeval/dataset_configs/estonian.py +11 -1
- euroeval/dataset_configs/finnish.py +0 -3
- euroeval/dataset_configs/french.py +0 -3
- euroeval/dataset_configs/german.py +0 -3
- euroeval/dataset_configs/italian.py +0 -3
- euroeval/dataset_configs/latvian.py +2 -4
- euroeval/dataset_configs/lithuanian.py +68 -0
- euroeval/dataset_configs/norwegian.py +0 -3
- euroeval/dataset_configs/polish.py +0 -3
- euroeval/dataset_configs/portuguese.py +0 -3
- euroeval/dataset_configs/slovak.py +60 -0
- euroeval/dataset_configs/spanish.py +0 -3
- euroeval/dataset_configs/swedish.py +10 -15
- euroeval/finetuning.py +21 -15
- euroeval/generation.py +10 -10
- euroeval/generation_utils.py +2 -3
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +10 -6
- euroeval/metrics/llm_as_a_judge.py +5 -3
- euroeval/metrics/pipeline.py +22 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +11 -14
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/linguistic_acceptability.py +30 -3
- euroeval/prompt_templates/multiple_choice.py +34 -1
- euroeval/prompt_templates/named_entity_recognition.py +71 -11
- euroeval/prompt_templates/reading_comprehension.py +41 -3
- euroeval/prompt_templates/sentiment_classification.py +34 -1
- euroeval/prompt_templates/summarization.py +26 -6
- euroeval/scores.py +7 -7
- euroeval/speed_benchmark.py +3 -5
- euroeval/task_group_utils/multiple_choice_classification.py +0 -3
- euroeval/task_group_utils/question_answering.py +0 -3
- euroeval/task_group_utils/sequence_classification.py +43 -31
- euroeval/task_group_utils/text_to_text.py +17 -8
- euroeval/task_group_utils/token_classification.py +10 -9
- euroeval/tokenisation_utils.py +22 -20
- euroeval/utils.py +30 -147
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA +182 -61
- euroeval-16.4.0.dist-info/RECORD +75 -0
- euroeval-16.2.2.dist-info/RECORD +0 -70
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/scores.py
CHANGED
@@ -6,12 +6,12 @@ import warnings
 
 import numpy as np
 
+from .logging_utils import log
+
 if t.TYPE_CHECKING:
     from .metrics import Metric
     from .types import ScoreDict
 
-logger = logging.getLogger("euroeval")
-
 
 def log_scores(
     dataset_name: str,
@@ -48,9 +48,8 @@ def log_scores(
     if model_param is not None:
         model_id += f"#{model_param}"
 
-    logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")
-
     total_dict: dict[str, float] = dict()
+    all_log_strs: list[str] = [f"Finished benchmarking {model_id} on {dataset_name}."]
     for metric in metrics:
         test_score, test_se = aggregate_scores(scores=scores, metric=metric)
         test_score, test_score_str = metric.postprocessing_fn(test_score)
@@ -58,11 +57,12 @@ def log_scores(
         total_dict[f"test_{metric.name}"] = test_score
         total_dict[f"test_{metric.name}_se"] = test_se
         log_str = (
-            f"{metric.pretty_name}: {test_score_str} ± {test_se_str}"
+            f"- {metric.pretty_name}: {test_score_str} ± {test_se_str}"
             if not np.isnan(test_se)
-            else f"{metric.pretty_name}: {test_score_str}"
+            else f"- {metric.pretty_name}: {test_score_str}"
         )
-
+        all_log_strs.append(log_str)
+    log("\n".join(all_log_strs), level=logging.INFO)
 
     return dict(raw=scores, total=total_dict)
 
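The module-level logger = logging.getLogger("euroeval") objects removed in this and the following files are superseded by helpers from the new euroeval/logging_utils.py module (+250 lines, whose contents are not shown in this diff). Judging only from the call sites visible here, log(message, level=...) and log_once(message, level=...), a minimal sketch of what such helpers could look like is given below; the actual 16.4.0 implementation may well differ.

    # Hypothetical sketch of the logging_utils helpers, inferred from the call
    # sites in this diff only; not the actual EuroEval implementation.
    import logging

    logger = logging.getLogger("euroeval")
    _seen_messages: set[str] = set()


    def log(message: str, level: int = logging.INFO) -> None:
        """Log `message` on the shared 'euroeval' logger at the given level."""
        logger.log(level, message)


    def log_once(message: str, level: int = logging.INFO) -> None:
        """Log `message` only the first time it is seen, to avoid repeated output."""
        if message not in _seen_messages:
            _seen_messages.add(message)
            logger.log(level, message)
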
euroeval/speed_benchmark.py
CHANGED
@@ -4,19 +4,17 @@ import logging
 import typing as t
 
 import pyinfer
-from tqdm.auto import tqdm
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
 from .benchmark_modules import HuggingFaceEncoderModel, LiteLLMModel, VLLMModel
 from .exceptions import InvalidBenchmark
+from .logging_utils import get_pbar, log
 from .utils import clear_memory
 
 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def benchmark_speed(
     model: "BenchmarkModule", benchmark_config: "BenchmarkConfig"
@@ -33,7 +31,7 @@ def benchmark_speed(
         Dictionary of scores.
     """
     scores: list[dict[str, float]] = list()
-    for idx in tqdm(
+    for idx in get_pbar(
         iterable=range(benchmark_config.num_iterations),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -41,7 +39,7 @@ def benchmark_speed(
         itr_scores = benchmark_speed_single_iteration(model=model, itr_idx=idx)
         clear_memory()
         scores.append(itr_scores)
-
+        log(f"Scores for iteration {idx}: {itr_scores}", level=logging.DEBUG)
     return scores
 
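get_pbar also comes from the new logging_utils module. Given the keyword arguments used above (iterable, desc, disable), it is presumably a thin wrapper around tqdm that centralises progress-bar configuration; a minimal sketch under that assumption:

    # Hypothetical sketch of get_pbar, assuming it simply forwards to tqdm;
    # the real helper may add extra styling or logging integration.
    import typing as t

    from tqdm.auto import tqdm


    def get_pbar(iterable: t.Iterable, desc: str, disable: bool = False) -> tqdm:
        """Return a tqdm progress bar over `iterable` with shared settings."""
        return tqdm(iterable=iterable, desc=desc, disable=disable)
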
euroeval/task_group_utils/multiple_choice_classification.py
CHANGED
@@ -1,7 +1,6 @@
 """Utility functions related to the multiple-choice classification task group."""
 
 import hashlib
-import logging
 import re
 import typing as t
 from collections import defaultdict
@@ -18,8 +17,6 @@ if t.TYPE_CHECKING:
 
     from ..types import Labels, Predictions
 
-logger = logging.getLogger("euroeval")
-
 
 class MultipleChoiceClassificationTrainer(Trainer):
     """Trainer subclass for multiple-choice classification tasks."""
 
euroeval/task_group_utils/question_answering.py
CHANGED
@@ -1,7 +1,6 @@
 """Utility functions related to the question-answering task group."""
 
 import collections.abc as c
-import logging
 import typing as t
 from collections import defaultdict
 
@@ -26,8 +25,6 @@ if t.TYPE_CHECKING:
     from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
-logger = logging.getLogger("euroeval")
-
 
 class QuestionAnsweringTrainer(Trainer):
     """Trainer subclass for question answering tasks."""
 
euroeval/task_group_utils/sequence_classification.py
CHANGED
@@ -19,13 +19,15 @@ if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
     from transformers.trainer_utils import EvalPrediction
 
-    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
+    from ..data_models import (
+        BenchmarkConfig,
+        DatasetConfig,
+        GenerativeModelOutput,
+        ModelConfig,
+    )
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
@@ -106,6 +108,7 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
+    model_config: "ModelConfig",
     first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
     """Extract the predicted labels from the generated output.
@@ -118,6 +121,8 @@ def extract_labels_from_generation(
             The raw generated output of the model.
         dataset_config:
             The configuration of the dataset.
+        model_config:
+            The configuration of the model.
         first_label_token_mapping:
             A mapping from labels to the first token in each label, or alternatively a
             Boolean value indicating whether the model should output scores (if the
@@ -167,6 +172,7 @@ def extract_labels_from_generation(
     )
 
     new_predicted_labels: list[str] = list()
+    num_predictions_being_very_off = 0
     for idx, predicted_label in enumerate(model_output.sequences):
         # If the prediction includes a boxed answer, use that instead of the full
         # generation
@@ -199,34 +205,40 @@ def extract_labels_from_generation(
         # word edit distance to the predicted label (if invalid model outputs are
         # allowed), or we raise an error
         if min(edit_distances) >= 1000:
-
-                logger.warning(
-                    "No candidate labels found for the predicted label "
-                    f"{predicted_label!r}, out of the candidate labels "
-                    f"{sample_candidate_labels[idx]}. This likely means that the model "
-                    "output is completely off, but since invalid model outputs are "
-                    "allowed for this task, we will use the closest candidate label "
-                    f"({best_candidate_label})) as the output label. If you see this "
-                    "warning very often, please report this issue to the EuroEval "
-                    "team at github.com/EuroEval/EuroEval/issues."
-                )
-                logger.debug(
-                    "The candidate labels were extracted from the prompt: "
-                    f"{input_batch['text'][idx]!r}."
-                )
-            else:
-                raise InvalidBenchmark(
-                    "No candidate labels found for the predicted label "
-                    f"{predicted_label!r}, out of the candidate labels "
-                    f"{sample_candidate_labels[idx]}. This likely means that the model "
-                    "output is completely off, and we cannot extract any labels from "
-                    "it. Please check the model output and the candidate labels. The "
-                    "candidate labels were extracted from the prompt: "
-                    f"{input_batch['text'][idx]!r}."
-                )
+            num_predictions_being_very_off += 1
 
         new_predicted_labels.append(best_candidate_label)
 
+    if num_predictions_being_very_off > 0:
+        if dataset_config.allow_invalid_model_outputs:
+            log_msg = (
+                "No candidate labels found for the predicted label in "
+                f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+                f"of the samples with the model {model_config.model_id!r}. This "
+                "likely means that the model were completely off in these cases, "
+                "but since invalid model outputs are allowed for this task, we used "
+                "the closest candidate labels as the output labels."
+            )
+            level = logging.DEBUG
+            if num_predictions_being_very_off / len(model_output.sequences) > 0.5:
+                log_msg += (
+                    " Since this happened for most of the model's predictions, please "
+                    "report this issue to the EuroEval team at "
+                    "github.com/EuroEval/EuroEval/issues."
+                )
+                level = logging.WARNING
+            log_once(log_msg, level=level)
+        else:
+            raise InvalidBenchmark(
+                "No candidate labels found for the predicted label in "
+                f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+                "of the samples. This likely means that the model were completely "
+                "off in these cases. Since this task does not allow invalid model "
+                "outputs, we have to abort the evaluation. Please re-run the "
+                "evaluation with the `--debug` flag (or `debug=True` if you're using "
+                "the `Benchmarker` API) to see the precise model outputs."
+            )
+
     return new_predicted_labels
 
 
@@ -355,7 +367,7 @@ def get_closest_logprobs_labels(
                     "be determined. This means that using logprobs to extract the "
                     "labels is not reliable, and we will instead fall back to "
                     "extracting the labels using word edit distance.",
-                    level=logging.
+                    level=logging.DEBUG,
                 )
             else:
                 log_once(
@@ -363,7 +375,7 @@ def get_closest_logprobs_labels(
                     "means that using logprobs to extract the labels is not reliable, "
                     "and we will instead fall back to extracting the labels using "
                     "word edit distance.",
-                    level=logging.
+                    level=logging.DEBUG,
                 )
             return None
 
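The per-sample warnings in extract_labels_from_generation are thus replaced by a single aggregated message, and the new model_config parameter lets that message name the offending model. The fallback itself still picks the candidate label closest to the raw generation; as a rough, self-contained illustration of that idea (EuroEval's own code uses word edit distance, the snippet below uses difflib similarity instead):

    # Standalone illustration of the "closest candidate label" fallback; this is
    # not EuroEval's implementation, which is based on word edit distance.
    import difflib


    def closest_candidate(predicted: str, candidates: list[str]) -> str:
        """Return the candidate label most similar to the predicted text."""
        scored = [
            (difflib.SequenceMatcher(a=predicted.lower(), b=label.lower()).ratio(), label)
            for label in candidates
        ]
        return max(scored)[1]


    print(closest_candidate("positiv", ["positive", "negative", "neutral"]))  # positive
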
euroeval/task_group_utils/text_to_text.py
CHANGED
@@ -7,6 +7,7 @@ import numpy as np
 
 from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
 from ..metrics import HuggingFaceMetric
 from ..utils import raise_if_model_output_contains_nan_values
 
@@ -18,9 +19,6 @@ if t.TYPE_CHECKING:
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
@@ -44,6 +42,10 @@ def compute_metrics(
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
         values.
+
+    Raises:
+        InvalidBenchmark:
+            If the metric computation fails.
     """
     model_outputs, labels = model_outputs_and_labels
 
@@ -72,7 +74,7 @@ def compute_metrics(
         ):
             metric.compute_kwargs["device"] = benchmark_config.device.type
 
-
+        for _ in range(num_attempts := 5):
             try:
                 score: float | None = metric(
                     predictions=predictions,
@@ -96,21 +98,28 @@ def compute_metrics(
                     and metric.compute_kwargs.get("device", "cpu") != "cpu"
                 ):
                     metric.compute_kwargs["device"] = "cpu"
-
+                    log(
                         "Out of memory error occurred during the computation of "
                         f"the metric {metric.pretty_name}. Moving the computation to "
-                        "the CPU."
+                        "the CPU.",
+                        level=logging.DEBUG,
                     )
                 else:
                     raise InvalidBenchmark(str(e)) from e
             finally:
                 for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
                     if hasattr(metric, attribute):
-
+                        log(
                             f"Deleting the {attribute!r} attribute of the metric "
-                            f"{metric.pretty_name} to free up memory."
+                            f"{metric.pretty_name} to free up memory.",
+                            level=logging.DEBUG,
                         )
                         delattr(metric, attribute)
+        else:
+            raise InvalidBenchmark(
+                f"Could not compute the metric {metric.pretty_name} after "
+                f"{num_attempts} attempts due to out of memory errors."
+            )
 
         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
 
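The retry loop added to compute_metrics above relies on Python's for/else construct: the else branch only runs when the loop finishes without hitting a break, i.e. when every attempt failed (the successful path presumably breaks out of the loop inside the try block, outside the lines shown in this hunk). A generic, self-contained sketch of the same pattern:

    # Generic for/else retry pattern, analogous to the compute_metrics change;
    # attempt() is a stand-in for the actual metric computation.
    import random


    def attempt() -> float:
        """Stand-in computation that fails roughly half the time."""
        if random.random() < 0.5:
            raise MemoryError("simulated out-of-memory error")
        return 42.0


    for _ in range(num_attempts := 5):
        try:
            score = attempt()
            break  # success: leave the loop, so the else branch is skipped
        except MemoryError:
            continue  # failed attempt: try again
    else:
        # Reached only if the loop never breaks, i.e. all attempts failed
        raise RuntimeError(f"Could not compute the score after {num_attempts} attempts.")

    print(score)
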
euroeval/task_group_utils/token_classification.py
CHANGED
@@ -7,6 +7,7 @@ from copy import deepcopy
 import numpy as np
 
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
 from ..utils import (
     extract_json_dict_from_string,
     raise_if_model_output_contains_nan_values,
@@ -22,9 +23,6 @@ if t.TYPE_CHECKING:
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     has_misc_tags: bool,
@@ -216,17 +214,19 @@ def extract_labels_from_generation(
     prompt_label_mapping = dataset_config.prompt_label_mapping
     for prompt_tag_name, named_entities in prediction_dict.items():
         if not isinstance(named_entities, list):
-
+            log(
                 "The model produced an invalid format for the named entities. "
-                f"Expected a list but got {type(named_entities)}. Skipping."
+                f"Expected a list but got {type(named_entities)}. Skipping.",
+                level=logging.DEBUG,
             )
             continue
         try:
             named_entities = [str(ne) for ne in named_entities]
         except Exception:
-
+            log(
                 "The model produced an invalid format for the named entities. "
-                f"Expected a list of strings but got {named_entities}. Skipping."
+                f"Expected a list of strings but got {named_entities}. Skipping.",
+                level=logging.DEBUG,
             )
             continue
         try:
@@ -236,9 +236,10 @@ def extract_labels_from_generation(
                 if prompt_tag == prompt_tag_name
             ][0]
         except IndexError:
-
+            log(
                 "The model produced an invalid prompt tag name, "
-                f"{prompt_tag_name}. Skipping."
+                f"{prompt_tag_name}. Skipping.",
+                level=logging.DEBUG,
             )
             continue
 
euroeval/tokenisation_utils.py
CHANGED
@@ -5,11 +5,11 @@ import re
 import typing as t
 
 import torch
-from transformers import MistralCommonTokenizer
+from transformers.tokenization_mistral_common import MistralCommonTokenizer
 
 from .enums import GenerativeType
 from .exceptions import InvalidModel
-from .
+from .logging_utils import log, log_once
 
 if t.TYPE_CHECKING:
     from transformers.tokenization_utils import PreTrainedTokenizer
@@ -18,9 +18,6 @@ if t.TYPE_CHECKING:
     from .data_models import DatasetConfig, ModelConfig
 
 
-logger = logging.getLogger("euroeval")
-
-
 def get_special_token_metadata(tokeniser: "PreTrainedTokenizerBase") -> dict:
     """Get the special token metadata for a tokeniser.
 
@@ -182,7 +179,7 @@ def get_bos_token(
             "The model does not have a beginning-of-sequence token. Please ensure that "
             "this has been set in the tokeniser's configuration. Using no BOS token."
             " This may lead to unexpected behavior in the model.",
-            level=logging.
+            level=logging.WARNING,
         )
         return None, None
 
@@ -223,14 +220,14 @@ def get_eos_token(
             "The model does not have an end-of-sequence token. Please ensure that this "
             "has been set in the tokeniser's configuration. Using no EOS token. This "
            "may lead to unexpected behavior in the model.",
-            level=logging.
+            level=logging.WARNING,
        )
        return None, None
 
    log_once(
        f"End-of-sequence token was not set, but detected it as {eos_token!r} with "
        f"ID {eos_token_id}.",
-        level=logging.
+        level=logging.WARNING,
    )
    return eos_token, eos_token_id
 
@@ -306,7 +303,7 @@ def get_pad_token(
            "Could not identify a padding token for the model. Please ensure that "
            "this has been set in the tokeniser's configuration. Using no padding "
            "token. This may lead to unexpected behavior in the model.",
-            level=logging.
+            level=logging.WARNING,
        )
        return None, None
 
@@ -358,12 +355,16 @@ def get_end_of_chat_token_ids(
            x_token_index = idx
            break
    else:
-
+        log(
+            "Could not locate the end-of-chat token for the model.", level=logging.DEBUG
+        )
        return None
 
    end_of_chat_tokens = token_ids[x_token_index + 1 :]
    if len(end_of_chat_tokens) == 0:
-
+        log(
+            "Could not locate the end-of-chat token for the model.", level=logging.DEBUG
+        )
        return None
 
    log_once(
@@ -506,7 +507,8 @@ def get_first_label_token_mapping(
        log_once(
            "We will not use logprobs with the model since the first tokens of the "
            "labels are not distinct. The first tokens for the labels "
-            f"{local_labels} are {first_tokens}"
+            f"{local_labels} are {first_tokens}",
+            level=logging.DEBUG,
        )
        return False
 
@@ -521,7 +523,14 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
    Returns:
        Whether the tokeniser has a chat template.
    """
-    if hasattr(tokeniser, "chat_template"):
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        log_once(
+            "The tokeniser is a Mistral tokeniser, so assuming that the model is "
+            "instruction tuned.",
+            level=logging.DEBUG,
+        )
+        return True
+    elif hasattr(tokeniser, "chat_template"):
        has_template = tokeniser.chat_template is not None
        if has_template:
            log_once(
@@ -530,13 +539,6 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
                level=logging.DEBUG,
            )
        return has_template
-    elif isinstance(tokeniser, MistralCommonTokenizer):
-        log_once(
-            "The tokeniser is a Mistral tokeniser, so assuming that the model is "
-            "instruction tuned.",
-            level=logging.DEBUG,
-        )
-        return True
    else:
        log_once(
            "We cannot find a chat template for the tokeniser, so assuming that the "
|