EuroEval 16.3.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +9 -2
- euroeval/benchmark_config_factory.py +51 -50
- euroeval/benchmark_modules/base.py +9 -21
- euroeval/benchmark_modules/fresh.py +2 -1
- euroeval/benchmark_modules/hf.py +101 -71
- euroeval/benchmark_modules/litellm.py +115 -53
- euroeval/benchmark_modules/vllm.py +107 -92
- euroeval/benchmarker.py +144 -121
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +86 -8
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +80 -29
- euroeval/data_models.py +338 -330
- euroeval/dataset_configs/__init__.py +12 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +55 -93
- euroeval/dataset_configs/dutch.py +48 -87
- euroeval/dataset_configs/english.py +45 -77
- euroeval/dataset_configs/estonian.py +42 -34
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -69
- euroeval/dataset_configs/french.py +39 -75
- euroeval/dataset_configs/german.py +45 -82
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -79
- euroeval/dataset_configs/latvian.py +28 -35
- euroeval/dataset_configs/lithuanian.py +28 -26
- euroeval/dataset_configs/norwegian.py +72 -115
- euroeval/dataset_configs/polish.py +33 -61
- euroeval/dataset_configs/portuguese.py +33 -66
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/spanish.py +42 -77
- euroeval/dataset_configs/swedish.py +52 -90
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +24 -17
- euroeval/generation.py +15 -14
- euroeval/generation_utils.py +8 -8
- euroeval/languages.py +395 -323
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +21 -6
- euroeval/metrics/llm_as_a_judge.py +6 -4
- euroeval/metrics/pipeline.py +17 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +17 -19
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +99 -42
- euroeval/prompt_templates/multiple_choice.py +102 -38
- euroeval/prompt_templates/named_entity_recognition.py +172 -51
- euroeval/prompt_templates/reading_comprehension.py +119 -42
- euroeval/prompt_templates/sentiment_classification.py +110 -40
- euroeval/prompt_templates/summarization.py +85 -40
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +11 -10
- euroeval/speed_benchmark.py +5 -6
- euroeval/task_group_utils/multiple_choice_classification.py +2 -4
- euroeval/task_group_utils/question_answering.py +24 -16
- euroeval/task_group_utils/sequence_classification.py +48 -35
- euroeval/task_group_utils/text_to_text.py +19 -9
- euroeval/task_group_utils/token_classification.py +21 -17
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +33 -22
- euroeval/types.py +10 -9
- euroeval/utils.py +35 -149
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +196 -39
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.3.0.dist-info/RECORD +0 -71
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmarker.py
CHANGED
@@ -1,10 +1,12 @@
 """Class that benchmarks language models."""
 
+import collections.abc as c
 import contextlib
+import datetime as dt
 import json
 import logging
+import os
 import re
-import sys
 import typing as t
 from pathlib import Path
 from shutil import rmtree
@@ -12,7 +14,6 @@ from time import sleep
 
 from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
 from torch.distributed import destroy_process_group
-from tqdm.auto import tqdm
 
 from .benchmark_config_factory import build_benchmark_config
 from .constants import GENERATIVE_PIPELINE_TAGS
@@ -23,6 +24,7 @@ from .enums import Device, GenerativeType, ModelType
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
 from .generation import generate
+from .logging_utils import adjust_logging_level, get_pbar, log, log_once
 from .model_config import get_model_config
 from .model_loading import load_model
 from .scores import log_scores
@@ -32,16 +34,12 @@ from .utils import (
     enforce_reproducibility,
     get_package_version,
     internet_connection_available,
-    log_once,
     split_model_id,
 )
 
 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
-    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
-
-
-logger = logging.getLogger("euroeval")
+    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig, Task
 
 
 class Benchmarker:
@@ -65,11 +63,11 @@ class Benchmarker:
         self,
         progress_bar: bool = True,
         save_results: bool = True,
-        task: str |
-        dataset:
-        language: str |
-        model_language: str |
-        dataset_language: str |
+        task: "str | Task | c.Sequence[str | Task] | None" = None,
+        dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None" = None,
+        language: str | c.Sequence[str] = "all",
+        model_language: str | c.Sequence[str] | None = None,
+        dataset_language: str | c.Sequence[str] | None = None,
         device: Device | None = None,
         batch_size: int = 32,
         raise_errors: bool = False,
@@ -179,6 +177,8 @@ class Benchmarker:
             ValueError:
                 If both `task` and `dataset` are specified, or if `download_only`
                 is True and we have no internet connection.
+            ImportError:
+                If `hf_transfer` is enabled but not installed.
         """
         if task is not None and dataset is not None:
             raise ValueError("Only one of `task` and `dataset` can be specified.")
@@ -200,6 +200,10 @@ class Benchmarker:
                 "Try installing it with `pip install hf_transfer`."
             )
 
+        # If FULL_LOG has been set, then force verbose mode
+        if os.getenv("FULL_LOG", "0") == "1":
+            verbose = True
+
         self.benchmark_config_default_params = BenchmarkConfigParams(
             task=task,
             dataset=dataset,
@@ -235,13 +239,13 @@ class Benchmarker:
         )
 
         # Initialise variable storing model lists, so we only have to fetch it once
-        self._model_lists: dict[str,
+        self._model_lists: dict[str, c.Sequence[str]] | None = None
 
         self.results_path = Path.cwd() / "euroeval_benchmark_results.jsonl"
         adjust_logging_level(verbose=self.benchmark_config.verbose)
 
     @property
-    def benchmark_results(self) ->
+    def benchmark_results(self) -> c.Sequence[BenchmarkResult]:
         """The benchmark results.
 
         Returns:
@@ -301,7 +305,6 @@ class Benchmarker:
         )
         del dataset
 
-        log_once(f"Loading model {model_config.model_id}", level=logging.INFO)
         model = load_model(
             model_config=model_config,
             dataset_config=dataset_config,
@@ -320,14 +323,14 @@ class Benchmarker:
 
     def benchmark(
         self,
-        model:
-        task: str |
-        dataset:
+        model: c.Sequence[str] | str,
+        task: "str | Task | c.Sequence[str | Task] | None" = None,
+        dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None" = None,
         progress_bar: bool | None = None,
         save_results: bool | None = None,
-        language: str |
-        model_language: str |
-        dataset_language: str |
+        language: str | c.Sequence[str] | None = None,
+        model_language: str | c.Sequence[str] | None = None,
+        dataset_language: str | c.Sequence[str] | None = None,
         device: Device | None = None,
         batch_size: int | None = None,
         raise_errors: bool | None = None,
@@ -347,7 +350,7 @@ class Benchmarker:
         force: bool | None = None,
         verbose: bool | None = None,
         debug: bool | None = None,
-    ) ->
+    ) -> c.Sequence[BenchmarkResult]:
         """Benchmarks models on datasets.
 
         Args:
@@ -605,13 +608,11 @@ class Benchmarker:
             clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
 
         model_ids = self._prepare_model_ids(model_id=model)
-        dataset_configs =
-            dataset_names=benchmark_config.datasets
-        )
+        dataset_configs = benchmark_config.datasets
 
         # Get all the model configs
         model_configs: list[ModelConfig] = list()
-        for model_id in
+        for model_id in get_pbar(
             iterable=model_ids,
             desc="Fetching model configurations",
             disable=not benchmark_config.verbose or not benchmark_config.progress_bar,
@@ -622,50 +623,63 @@ class Benchmarker:
                 )
                 model_configs.append(model_config)
             except InvalidModel as e:
-
+                log(e.message, level=logging.ERROR)
 
         # Create a dictionary that takes each model config to the dataset configs that
-        # we need to benchmark the model on.
-        #
-
-
+        # we need to benchmark the model on. We initially include all the relevant
+        # datasets for each model.
+        model_config_to_dataset_configs: dict[
+            ModelConfig, c.Sequence[DatasetConfig]
+        ] = {
             model_config: [
                 dataset_config
                 for dataset_config in dataset_configs
-                if
-                    benchmark_config.force
-                    or not model_has_been_benchmarked(
-                        model_config=model_config,
-                        dataset_config=dataset_config,
-                        benchmark_config=benchmark_config,
-                        benchmark_results=self.benchmark_results,
-                    )
-                )
-                and model_config.model_type in dataset_config.allowed_model_types
+                if model_config.model_type in dataset_config.allowed_model_types
             ]
             for model_config in model_configs
         }
 
+        # Initialise the current benchmark results with all the ones that we have cached
+        # on disk already (can be none), and remove those datasets from the mapping
+        current_benchmark_results: list[BenchmarkResult] = list()
+        for (
+            model_config,
+            model_dataset_configs,
+        ) in model_config_to_dataset_configs.items():
+            new_model_dataset_configs: list[DatasetConfig] = list()
+            for dataset_config in model_dataset_configs:
+                benchmark_record = get_record(
+                    model_config=model_config,
+                    dataset_config=dataset_config,
+                    benchmark_config=benchmark_config,
+                    benchmark_results=self.benchmark_results,
+                )
+                if benchmark_record is not None and not benchmark_config.force:
+                    current_benchmark_results.append(benchmark_record)
+                else:
+                    new_model_dataset_configs.append(dataset_config)
+            model_config_to_dataset_configs[model_config] = new_model_dataset_configs
+
         total_benchmarks = sum(
             len(dataset_configs)
             for dataset_configs in model_config_to_dataset_configs.values()
         )
         if total_benchmarks == 0:
-
+            log(
                 "No benchmarks to run, as all the selected models have already been "
-                "benchmarked on all the selected datasets."
+                "benchmarked on all the selected datasets.",
+                level=logging.INFO,
             )
-            return
-
-        logger.info(f"Initiated evaluation of {total_benchmarks:,} benchmarks.")
+            return current_benchmark_results
 
         num_finished_benchmarks = 0
-
+        benchmark_params_to_revert: dict[str, t.Any] = dict()
         for model_config in model_configs:
             if not model_config_to_dataset_configs[model_config]:
-
+                log(
                     f"Skipping model {model_config.model_id!r} because it has "
-                    "already been benchmarked on all valid datasets."
+                    "already been benchmarked on all valid datasets.",
+                    level=logging.DEBUG,
                 )
                 continue
 
@@ -691,7 +705,6 @@ class Benchmarker:
             )
 
             loaded_model: BenchmarkModule | None = None
-            benchmark_params_to_revert: dict[str, t.Any] = dict()
             for dataset_config in model_config_to_dataset_configs[model_config]:
                 # Revert any changes to the benchmark configuration made for the
                 # previous dataset
@@ -704,18 +717,20 @@ class Benchmarker:
                     "val" not in dataset_config.splits
                     and not benchmark_config.evaluate_test_split
                 ):
-
+                    log(
                         "The dataset does not have a validation split, so even though "
                         "you requested evaluating the validation split (the default), "
-                        "we will evaluate on the test split."
+                        "we will evaluate on the test split.",
+                        level=logging.DEBUG,
                     )
                     benchmark_params_to_revert["evaluate_test_split"] = False
                     benchmark_config.evaluate_test_split = True
                 if dataset_config.task.requires_zero_shot and benchmark_config.few_shot:
-
+                    log(
                         "The task requires zero-shot evaluation, so even though you "
                         "requested few-shot evaluation (the default), we will evaluate "
-                        "zero-shot."
+                        "zero-shot.",
+                        level=logging.DEBUG,
                     )
                     benchmark_params_to_revert["few_shot"] = True
                     benchmark_config.few_shot = False
@@ -723,13 +738,7 @@ class Benchmarker:
                 # We do not re-initialise generative models as their architecture is not
                 # customised to specific datasets
                 if model_config.model_type == ModelType.GENERATIVE:
-                    initial_logging(
-                        model_config=model_config,
-                        dataset_config=dataset_config,
-                        benchmark_config=benchmark_config,
-                    )
                     if loaded_model is None:
-                        logger.info("Loading model...")
                         try:
                             loaded_model = load_model(
                                 model_config=model_config,
@@ -739,7 +748,7 @@ class Benchmarker:
                         except InvalidModel as e:
                             if benchmark_config.raise_errors:
                                 raise e
-
+                            log(e.message, level=logging.ERROR)
 
                             # Add the remaining number of benchmarks for the model to
                             # our benchmark counter, since we're skipping the rest of
@@ -759,12 +768,13 @@ class Benchmarker:
                         loaded_model.generative_type
                         not in dataset_config.allowed_generative_types
                     ):
-
+                        log(
                             f"Skipping the benchmark of model "
                             f"{model_config.model_id!r}on dataset "
                             f"{dataset_config.name!r} because the model has generative "
                             f"type {loaded_model.generative_type} and the dataset "
-                            f"only allows {dataset_config.allowed_generative_types}."
+                            f"only allows {dataset_config.allowed_generative_types}.",
+                            level=logging.DEBUG,
                         )
                         num_finished_benchmarks += 1
                         continue
@@ -775,6 +785,8 @@ class Benchmarker:
                     model_config=model_config,
                     dataset_config=dataset_config,
                     benchmark_config=benchmark_config,
+                    num_finished_benchmarks=num_finished_benchmarks,
+                    num_total_benchmarks=total_benchmarks,
                 )
 
                 if (
@@ -784,12 +796,12 @@ class Benchmarker:
                     raise benchmark_output_or_err
 
                 elif isinstance(benchmark_output_or_err, InvalidBenchmark):
-
+                    log(benchmark_output_or_err.message, level=logging.WARNING)
                     num_finished_benchmarks += 1
                     continue
 
                 elif isinstance(benchmark_output_or_err, InvalidModel):
-
+                    log(benchmark_output_or_err.message, level=logging.WARNING)
 
                     # Add the remaining number of benchmarks for the model to our
                     # benchmark counter, since we're skipping the rest of them
@@ -805,15 +817,15 @@ class Benchmarker:
                         record.append_to_results(results_path=self.results_path)
 
                 num_finished_benchmarks += 1
-                logger.info(
-                    f"Finished {num_finished_benchmarks} out of "
-                    f"{total_benchmarks} benchmarks."
-                )
 
             del loaded_model
             if benchmark_config.clear_model_cache:
                 clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
 
+        log(
+            f"\nCompleted {num_finished_benchmarks:,} benchmarks.\n", level=logging.INFO
+        )
+
        # This avoids the following warning at the end of the benchmarking:
        # Warning: WARNING: process group has NOT been destroyed before we destruct
        # ProcessGroupNCCL. On normal program exit, the application should call
@@ -826,7 +838,7 @@ class Benchmarker:
            destroy_process_group()
        return current_benchmark_results
 
-    def _prepare_model_ids(self, model_id:
+    def _prepare_model_ids(self, model_id: c.Sequence[str] | str) -> c.Sequence[str]:
        """Prepare the model ID(s) to be benchmarked.
 
        Args:
@@ -857,6 +869,8 @@ class Benchmarker:
        model_config: "ModelConfig",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
+        num_finished_benchmarks: int,
+        num_total_benchmarks: int,
    ) -> BenchmarkResult | InvalidBenchmark | InvalidModel:
        """Benchmark a single model on a single dataset.
 
@@ -869,25 +883,29 @@ class Benchmarker:
                The configuration of the dataset we are evaluating on.
            benchmark_config:
                The general benchmark configuration.
+            num_finished_benchmarks:
+                The number of benchmarks that have already been completed.
+            num_total_benchmarks:
+                The total number of benchmarks to be completed.
 
        Returns:
            The benchmark result, or an error if the benchmark was unsuccessful.
-        """
-        if model is None:
-            initial_logging(
-                model_config=model_config,
-                dataset_config=dataset_config,
-                benchmark_config=benchmark_config,
-            )
 
-
+        Raises:
+            RuntimeError:
+                If the MPS fallback is not enabled when required.
+            InvalidBenchmark:
+                If the benchmark was unsuccessful.
+            InvalidModel:
+                If the model is invalid.
+        """
+        for _ in range(num_attempts := 5):
            try:
                # Set random seeds to enforce reproducibility of the randomly
                # initialised weights
                rng = enforce_reproducibility()
 
                if model is None or model_config.model_type != ModelType.GENERATIVE:
-                    logger.info("Loading model...")
                    model = load_model(
                        model_config=model_config,
                        dataset_config=dataset_config,
@@ -895,6 +913,14 @@ class Benchmarker:
                    )
                assert model is not None
 
+                initial_logging(
+                    model_config=model_config,
+                    dataset_config=dataset_config,
+                    benchmark_config=benchmark_config,
+                    num_finished_benchmarks=num_finished_benchmarks,
+                    num_total_benchmarks=num_total_benchmarks,
+                )
+
                if dataset_config.task == SPEED:
                    scores = benchmark_speed(
                        model=model, benchmark_config=benchmark_config
@@ -962,14 +988,15 @@ class Benchmarker:
                        few_shot=benchmark_config.few_shot,
                        validation_split=not benchmark_config.evaluate_test_split,
                    )
-
+                log(f"Results:\n{results}", level=logging.DEBUG)
                return record
 
            except HuggingFaceHubDown:
                wait_time = 30
-
+                log(
                    f"The Hugging Face Hub seems to be down. Retrying in {wait_time} "
-                    "seconds."
+                    "seconds.",
+                    level=logging.DEBUG,
                )
                sleep(wait_time)
                continue
@@ -992,23 +1019,29 @@ class Benchmarker:
            elif benchmark_config.raise_errors:
                raise e
            return e
+        else:
+            return InvalidBenchmark(
+                f"Failed to benchmark model {model_config.model_id!r} on dataset "
+                f"{dataset_config.name!r} after {num_attempts} attempts."
+            )
 
    def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any:  # noqa: ANN401
        """Alias for `self.benchmark()`."""
-
+        log(
            "Calling the `Benchmarker` class directly is deprecated. Please use the "
-            "`benchmark` function instead. This will be removed in a future version."
+            "`benchmark` function instead. This will be removed in a future version.",
+            level=logging.WARNING,
        )
        return self.benchmark(*args, **kwds)
 
 
-def
+def get_record(
    model_config: "ModelConfig",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
-    benchmark_results:
-) ->
-    """
+    benchmark_results: c.Sequence[BenchmarkResult],
+) -> BenchmarkResult | None:
+    """Get the benchmark record for a given model and dataset.
 
    Args:
        model_config:
@@ -1021,7 +1054,7 @@ def model_has_been_benchmarked(
            The benchmark results.
 
    Returns:
-
+        The benchmark record, or None if no such record exists.
    """
    for record in benchmark_results:
        model_id_components = split_model_id(model_id=record.model)
@@ -1046,30 +1079,8 @@ def model_has_been_benchmarked(
            and same_split
            and same_num_shots
        ):
-            return
-    return
-
-
-def adjust_logging_level(verbose: bool, ignore_testing: bool = False) -> int:
-    """Adjust the logging level based on verbosity.
-
-    Args:
-        verbose:
-            Whether to output additional output.
-        ignore_testing:
-            Whether to ignore the testing flag.
-
-    Returns:
-        The logging level that was set.
-    """
-    if hasattr(sys, "_called_from_test") and not ignore_testing:
-        logging_level = logging.CRITICAL
-    elif verbose:
-        logging_level = logging.DEBUG
-    else:
-        logging_level = logging.INFO
-    logger.setLevel(logging_level)
-    return logging_level
+            return record
+    return None
 
 
 def clear_model_cache_fn(cache_dir: str) -> None:
@@ -1090,7 +1101,9 @@ def clear_model_cache_fn(cache_dir: str) -> None:
        rmtree(sub_model_dir)
 
 
-def prepare_dataset_configs(
+def prepare_dataset_configs(
+    dataset_names: c.Sequence[str],
+) -> c.Sequence["DatasetConfig"]:
    """Prepare the dataset configuration(s) to be benchmarked.
 
    Args:
@@ -1109,6 +1122,8 @@ def initial_logging(
    model_config: "ModelConfig",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
+    num_finished_benchmarks: int,
+    num_total_benchmarks: int,
) -> None:
    """Initial logging at the start of the benchmarking process.
 
@@ -1119,6 +1134,10 @@ def initial_logging(
            The configuration of the dataset we are evaluating on.
        benchmark_config:
            The general benchmark configuration.
+        num_finished_benchmarks:
+            The number of benchmarks that have already been finished.
+        num_total_benchmarks:
+            The total number of benchmarks to be run.
    """
    model_id = model_config.model_id
    if model_config.revision and model_config.revision != "main":
@@ -1135,21 +1154,25 @@ def initial_logging(
    else:
        eval_type = "Benchmarking"
 
-
-        f"{eval_type} {model_id} on the {split_type} split of "
-        f"{dataset_config.pretty_name}"
+    log_once(
+        f"\n{eval_type} {model_id} on the {split_type} split of "
+        f"{dataset_config.pretty_name} ({num_finished_benchmarks + 1}/"
+        f"{num_total_benchmarks} benchmarks)...",
+        prefix=f"\n[{dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]",
    )
 
    if dataset_config.unofficial:
-
+        log_once(
            f"Note that the {dataset_config.name!r} dataset is unofficial, "
            "meaning that the resulting evaluation will not be included in the "
-            "official leaderboard."
+            "official leaderboard.",
+            level=logging.WARNING,
        )
 
    if benchmark_config.debug:
-
+        log_once(
            "Running in debug mode. This will output additional information, as "
            "well as store the model outputs in the current directory after each "
-            "batch. For this reason, evaluation will be slower."
+            "batch. For this reason, evaluation will be slower.",
+            level=logging.WARNING,
        )
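For orientation, the following is a minimal usage sketch of the reworked `Benchmarker` API shown in the diff above. It assumes EuroEval 16.5.0 is installed; the model ID, task name and language code are placeholders chosen for illustration, not recommendations. The `FULL_LOG` environment variable and the sequence-of-`BenchmarkResult` return value are the behaviours touched by this release.

    import os

    from euroeval import Benchmarker

    # New in this release: FULL_LOG=1 forces verbose mode regardless of `verbose`.
    os.environ["FULL_LOG"] = "1"

    # Placeholder selection values, shown only to illustrate the new signature.
    benchmarker = Benchmarker(progress_bar=True, save_results=True, language="da")
    results = benchmarker.benchmark(
        model="some-org/some-model",  # hypothetical model ID
        task="sentiment-classification",
    )

    # `benchmark` returns a sequence of BenchmarkResult records, including cached
    # records found via `get_record` for already-benchmarked combinations.
    for record in results:
        print(record.model)

Note that `task` and `dataset` remain mutually exclusive, and that previously benchmarked model/dataset pairs are reused from the results cache unless `force=True` is passed.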
euroeval/caching_utils.py
ADDED
@@ -0,0 +1,79 @@
+"""Caching utility functions."""
+
+import typing as t
+from functools import wraps
+
+from .constants import T
+
+
+def cache_arguments(
+    *arguments: str, disable_condition: t.Callable[[], bool] = lambda: False
+) -> t.Callable[[t.Callable[..., T]], t.Callable[..., T]]:
+    """Cache specified arguments of a function.
+
+    Args:
+        arguments:
+            The list of argument names to cache. If empty, all arguments are cached.
+        disable_condition:
+            A function that checks if cache should be disabled.
+
+    Returns:
+        A decorator that caches the specified arguments of a function.
+    """
+
+    def caching_decorator(func: t.Callable[..., T]) -> t.Callable[..., T]:
+        """Decorator that caches the specified arguments of a function.
+
+        Args:
+            func:
+                The function to decorate.
+
+        Returns:
+            The decorated function.
+        """
+        cache: dict[tuple, T] = dict()
+
+        @wraps(func)
+        def wrapper(*args, **kwargs) -> T:
+            """Wrapper function that caches the specified arguments.
+
+            Args:
+                *args:
+                    The positional arguments to the function.
+                **kwargs:
+                    The keyword arguments to the function.
+
+            Returns:
+                The result of the function.
+
+            Raises:
+                ValueError:
+                    If an argument name is not found in the function parameters.
+            """
+            if not arguments:
+                key = args + tuple(kwargs[k] for k in sorted(kwargs.keys()))
+            else:
+                func_params = func.__code__.co_varnames
+                key_items: list[t.Any] = list()
+                for arg_name in arguments:
+                    if arg_name in kwargs:
+                        key_items.append(kwargs[arg_name])
+                    else:
+                        try:
+                            arg_index = func_params.index(arg_name)
+                            key_items.append(args[arg_index])
+                        except (ValueError, IndexError):
+                            raise ValueError(
+                                f"Argument {arg_name} not found in function "
+                                f"{func.__name__} parameters."
+                            )
+                key = tuple(key_items)
+
+            # Do not cache if the condition is met
+            if key not in cache or disable_condition():
+                cache[key] = func(*args, **kwargs)
+            return cache[key]
+
+        return wrapper
+
+    return caching_decorator