EuroEval 16.2.2__py3-none-any.whl → 16.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +7 -4
- euroeval/benchmark_config_factory.py +0 -4
- euroeval/benchmark_modules/base.py +3 -16
- euroeval/benchmark_modules/fresh.py +5 -2
- euroeval/benchmark_modules/hf.py +107 -66
- euroeval/benchmark_modules/litellm.py +103 -55
- euroeval/benchmark_modules/vllm.py +155 -82
- euroeval/benchmarker.py +184 -129
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +1 -1
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +14 -11
- euroeval/data_models.py +12 -4
- euroeval/dataset_configs/__init__.py +3 -0
- euroeval/dataset_configs/czech.py +79 -0
- euroeval/dataset_configs/danish.py +10 -13
- euroeval/dataset_configs/dutch.py +0 -3
- euroeval/dataset_configs/english.py +0 -3
- euroeval/dataset_configs/estonian.py +11 -1
- euroeval/dataset_configs/finnish.py +0 -3
- euroeval/dataset_configs/french.py +0 -3
- euroeval/dataset_configs/german.py +0 -3
- euroeval/dataset_configs/italian.py +0 -3
- euroeval/dataset_configs/latvian.py +2 -4
- euroeval/dataset_configs/lithuanian.py +68 -0
- euroeval/dataset_configs/norwegian.py +0 -3
- euroeval/dataset_configs/polish.py +0 -3
- euroeval/dataset_configs/portuguese.py +0 -3
- euroeval/dataset_configs/slovak.py +60 -0
- euroeval/dataset_configs/spanish.py +0 -3
- euroeval/dataset_configs/swedish.py +10 -15
- euroeval/finetuning.py +21 -15
- euroeval/generation.py +10 -10
- euroeval/generation_utils.py +2 -3
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +10 -6
- euroeval/metrics/llm_as_a_judge.py +5 -3
- euroeval/metrics/pipeline.py +22 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +11 -14
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/linguistic_acceptability.py +30 -3
- euroeval/prompt_templates/multiple_choice.py +34 -1
- euroeval/prompt_templates/named_entity_recognition.py +71 -11
- euroeval/prompt_templates/reading_comprehension.py +41 -3
- euroeval/prompt_templates/sentiment_classification.py +34 -1
- euroeval/prompt_templates/summarization.py +26 -6
- euroeval/scores.py +7 -7
- euroeval/speed_benchmark.py +3 -5
- euroeval/task_group_utils/multiple_choice_classification.py +0 -3
- euroeval/task_group_utils/question_answering.py +0 -3
- euroeval/task_group_utils/sequence_classification.py +43 -31
- euroeval/task_group_utils/text_to_text.py +17 -8
- euroeval/task_group_utils/token_classification.py +10 -9
- euroeval/tokenisation_utils.py +22 -20
- euroeval/utils.py +30 -147
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA +182 -61
- euroeval-16.4.0.dist-info/RECORD +75 -0
- euroeval-16.2.2.dist-info/RECORD +0 -70
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmarker.py
CHANGED
@@ -1,10 +1,11 @@
 """Class that benchmarks language models."""
 
 import contextlib
+import datetime as dt
 import json
 import logging
+import os
 import re
-import sys
 import typing as t
 from pathlib import Path
 from shutil import rmtree
@@ -22,6 +23,7 @@ from .enums import Device, GenerativeType, ModelType
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
 from .generation import generate
+from .logging_utils import adjust_logging_level, get_pbar, log, log_once
 from .model_config import get_model_config
 from .model_loading import load_model
 from .scores import log_scores
@@ -31,7 +33,7 @@ from .utils import (
     enforce_reproducibility,
     get_package_version,
     internet_connection_available,
-
+    split_model_id,
 )
 
 if t.TYPE_CHECKING:
@@ -39,9 +41,6 @@ if t.TYPE_CHECKING:
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
 
-logger = logging.getLogger("euroeval")
-
-
 class Benchmarker:
     """Benchmarking all the language models.
 
@@ -82,7 +81,7 @@ class Benchmarker:
         num_iterations: int = 10,
         api_base: str | None = None,
         api_version: str | None = None,
-        gpu_memory_utilization: float = 0.
+        gpu_memory_utilization: float = 0.8,
         generative_type: GenerativeType | None = None,
         debug: bool = False,
         run_with_cli: bool = False,
@@ -198,6 +197,10 @@ class Benchmarker:
                 "Try installing it with `pip install hf_transfer`."
             )
 
+        # If FULL_LOG has been set, then force verbose mode
+        if os.getenv("FULL_LOG", "0") == "1":
+            verbose = True
+
         self.benchmark_config_default_params = BenchmarkConfigParams(
             task=task,
             dataset=dataset,
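The new FULL_LOG switch is read from the environment when the Benchmarker is constructed, so verbose logging can be forced without changing any arguments. A minimal sketch of how this could be used, assuming only that Benchmarker is importable from the package root as in previous releases:

import os

from euroeval import Benchmarker

# Setting FULL_LOG=1 before construction forces `verbose=True` inside __init__
os.environ["FULL_LOG"] = "1"
benchmarker = Benchmarker()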
@@ -299,7 +302,6 @@
             )
             del dataset
 
-        log_once(f"Loading model {model_config.model_id}", level=logging.INFO)
         model = load_model(
             model_config=model_config,
             dataset_config=dataset_config,
@@ -607,46 +609,90 @@
             dataset_names=benchmark_config.datasets
         )
 
-
-
-
-
-
-
+        # Get all the model configs
+        model_configs: list[ModelConfig] = list()
+        for model_id in get_pbar(
+            iterable=model_ids,
+            desc="Fetching model configurations",
+            disable=not benchmark_config.verbose or not benchmark_config.progress_bar,
+        ):
             try:
                 model_config = get_model_config(
                     model_id=model_id, benchmark_config=benchmark_config
                 )
+                model_configs.append(model_config)
             except InvalidModel as e:
-
-
+                log(e.message, level=logging.ERROR)
+
+        # Create a dictionary that takes each model config to the dataset configs that
+        # we need to benchmark the model on. Here we remove the datasets that the model
+        # has already been benchmarked on, or datasets that the model cannot be
+        # benchmarked on.
+        model_config_to_dataset_configs: dict[ModelConfig, list[DatasetConfig]] = {
+            model_config: [
+                dataset_config
+                for dataset_config in dataset_configs
+                if (
+                    benchmark_config.force
+                    or not model_has_been_benchmarked(
+                        model_config=model_config,
+                        dataset_config=dataset_config,
+                        benchmark_config=benchmark_config,
+                        benchmark_results=self.benchmark_results,
+                    )
+                )
+                and model_config.model_type in dataset_config.allowed_model_types
+            ]
+            for model_config in model_configs
+        }
+
+        total_benchmarks = sum(
+            len(dataset_configs)
+            for dataset_configs in model_config_to_dataset_configs.values()
+        )
+        if total_benchmarks == 0:
+            log(
+                "No benchmarks to run, as all the selected models have already been "
+                "benchmarked on all the selected datasets.",
+                level=logging.INFO,
+            )
+            return list()
+
+        num_finished_benchmarks = 0
+        current_benchmark_results: list[BenchmarkResult] = list()
+        benchmark_params_to_revert: dict[str, t.Any] = dict()
+        for model_config in model_configs:
+            if not model_config_to_dataset_configs[model_config]:
+                log(
+                    f"Skipping model {model_config.model_id!r} because it has "
+                    "already been benchmarked on all valid datasets.",
+                    level=logging.DEBUG,
+                )
                 continue
 
             if model_config.adapter_base_model_id:
                 open_issue_msg = (
-                    "If offline support is important to you, please "
-                    "
+                    "If offline support is important to you, please consider opening "
+                    "an issue at https://github.com/EuroEval/EuroEval/issues."
                 )
                 if not internet_connection_available():
                     raise InvalidModel(
                         "Offline benchmarking of models with adapters is not currently "
-                        "supported. "
-
+                        "supported. An active internet connection is required. "
+                        "{open_issue_msg}"
                     )
                 elif benchmark_config.download_only:
                     log_once(
                         "You are using download only mode with a model that includes "
-                        "an adapter. "
-                        "
-                        "
-                        "An internet connection will be required during evaluation. "
+                        "an adapter. Please note that offline benchmarking of "
+                        "adapter models is not currently supported - an internet "
+                        "connection will be required during evaluation in this case. "
                         f"{open_issue_msg}",
                         level=logging.WARNING,
                     )
 
             loaded_model: BenchmarkModule | None = None
-
-            for dataset_config in dataset_configs:
+            for dataset_config in model_config_to_dataset_configs[model_config]:
                 # Revert any changes to the benchmark configuration made for the
                 # previous dataset
                 for param, value in benchmark_params_to_revert.items():
@@ -658,60 +704,28 @@
                     "val" not in dataset_config.splits
                     and not benchmark_config.evaluate_test_split
                 ):
-
+                    log(
                         "The dataset does not have a validation split, so even though "
                         "you requested evaluating the validation split (the default), "
-                        "we will evaluate on the test split."
+                        "we will evaluate on the test split.",
+                        level=logging.DEBUG,
                     )
                     benchmark_params_to_revert["evaluate_test_split"] = False
                     benchmark_config.evaluate_test_split = True
                 if dataset_config.task.requires_zero_shot and benchmark_config.few_shot:
-
+                    log(
                         "The task requires zero-shot evaluation, so even though you "
                         "requested few-shot evaluation (the default), we will evaluate "
-                        "zero-shot."
+                        "zero-shot.",
+                        level=logging.DEBUG,
                     )
                     benchmark_params_to_revert["few_shot"] = True
                     benchmark_config.few_shot = False
 
-                # Skip if we have already benchmarked this model on this dataset and
-                # we are not forcing the benchmark
-                if not benchmark_config.force and model_has_been_benchmarked(
-                    model_id=model_id,
-                    dataset=dataset_config.name,
-                    few_shot=benchmark_config.few_shot,
-                    validation_split=not benchmark_config.evaluate_test_split,
-                    benchmark_results=self.benchmark_results,
-                ):
-                    logger.debug(
-                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it has already been "
-                        "benchmarked."
-                    )
-                    num_finished_benchmarks += 1
-                    continue
-
-                # Skip if the model type should not be benchmarked on this dataset
-                model_type = model_config.model_type
-                allowed_model_types = dataset_config.allowed_model_types
-                if model_type not in allowed_model_types:
-                    logger.debug(
-                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it is of type {model_type}, "
-                        f"and the only allowed model types are {allowed_model_types}."
-                    )
-                    continue
-
                 # We do not re-initialise generative models as their architecture is not
                 # customised to specific datasets
                 if model_config.model_type == ModelType.GENERATIVE:
-                    initial_logging(
-                        model_config=model_config,
-                        dataset_config=dataset_config,
-                        benchmark_config=benchmark_config,
-                    )
                     if loaded_model is None:
-                        logger.info("Loading model...")
                         try:
                             loaded_model = load_model(
                                 model_config=model_config,
@@ -721,7 +735,7 @@
                         except InvalidModel as e:
                             if benchmark_config.raise_errors:
                                 raise e
-
+                            log(e.message, level=logging.ERROR)
 
                             # Add the remaining number of benchmarks for the model to
                             # our benchmark counter, since we're skipping the rest of
@@ -735,12 +749,31 @@
                     else:
                         loaded_model.dataset_config = dataset_config
 
+                # Skip the benchmark if the model is not of the correct
+                # generative type
+                if (
+                    loaded_model.generative_type
+                    not in dataset_config.allowed_generative_types
+                ):
+                    log(
+                        f"Skipping the benchmark of model "
+                        f"{model_config.model_id!r}on dataset "
+                        f"{dataset_config.name!r} because the model has generative "
+                        f"type {loaded_model.generative_type} and the dataset "
+                        f"only allows {dataset_config.allowed_generative_types}.",
+                        level=logging.DEBUG,
+                    )
+                    num_finished_benchmarks += 1
+                    continue
+
                 # Benchmark a single model on a single dataset
                 benchmark_output_or_err = self._benchmark_single(
                     model=loaded_model,
                     model_config=model_config,
                     dataset_config=dataset_config,
                     benchmark_config=benchmark_config,
+                    num_finished_benchmarks=num_finished_benchmarks,
+                    num_total_benchmarks=total_benchmarks,
                 )
 
                 if (
@@ -750,12 +783,12 @@
                     raise benchmark_output_or_err
 
                 elif isinstance(benchmark_output_or_err, InvalidBenchmark):
-
+                    log(benchmark_output_or_err.message, level=logging.WARNING)
                     num_finished_benchmarks += 1
                     continue
 
                 elif isinstance(benchmark_output_or_err, InvalidModel):
-
+                    log(benchmark_output_or_err.message, level=logging.WARNING)
 
                     # Add the remaining number of benchmarks for the model to our
                     # benchmark counter, since we're skipping the rest of them
@@ -771,15 +804,13 @@
                     record.append_to_results(results_path=self.results_path)
 
                 num_finished_benchmarks += 1
-                logger.info(
-                    f"Finished {num_finished_benchmarks} out of "
-                    f"{total_benchmarks} benchmarks."
-                )
 
             del loaded_model
             if benchmark_config.clear_model_cache:
                 clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
 
+        log(f"Completed {num_finished_benchmarks:,} benchmarks.\n", level=logging.INFO)
+
         # This avoids the following warning at the end of the benchmarking:
         # Warning: WARNING: process group has NOT been destroyed before we destruct
         # ProcessGroupNCCL. On normal program exit, the application should call
@@ -823,6 +854,8 @@
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        num_finished_benchmarks: int,
+        num_total_benchmarks: int,
     ) -> BenchmarkResult | InvalidBenchmark | InvalidModel:
         """Benchmark a single model on a single dataset.
 
@@ -835,25 +868,29 @@
                 The configuration of the dataset we are evaluating on.
             benchmark_config:
                 The general benchmark configuration.
+            num_finished_benchmarks:
+                The number of benchmarks that have already been completed.
+            num_total_benchmarks:
+                The total number of benchmarks to be completed.
 
         Returns:
             The benchmark result, or an error if the benchmark was unsuccessful.
-        """
-        if model is None:
-            initial_logging(
-                model_config=model_config,
-                dataset_config=dataset_config,
-                benchmark_config=benchmark_config,
-            )
 
-
+        Raises:
+            RuntimeError:
+                If the MPS fallback is not enabled when required.
+            InvalidBenchmark:
+                If the benchmark was unsuccessful.
+            InvalidModel:
+                If the model is invalid.
+        """
+        for _ in range(num_attempts := 5):
             try:
                 # Set random seeds to enforce reproducibility of the randomly
                 # initialised weights
                 rng = enforce_reproducibility()
 
                 if model is None or model_config.model_type != ModelType.GENERATIVE:
-                    logger.info("Loading model...")
                     model = load_model(
                         model_config=model_config,
                         dataset_config=dataset_config,
@@ -861,6 +898,14 @@
                     )
                 assert model is not None
 
+                initial_logging(
+                    model_config=model_config,
+                    dataset_config=dataset_config,
+                    benchmark_config=benchmark_config,
+                    num_finished_benchmarks=num_finished_benchmarks,
+                    num_total_benchmarks=num_total_benchmarks,
+                )
+
                 if dataset_config.task == SPEED:
                     scores = benchmark_speed(
                         model=model, benchmark_config=benchmark_config
@@ -928,14 +973,15 @@
                     few_shot=benchmark_config.few_shot,
                     validation_split=not benchmark_config.evaluate_test_split,
                 )
-
+                log(f"Results:\n{results}", level=logging.DEBUG)
                 return record
 
             except HuggingFaceHubDown:
                 wait_time = 30
-
+                log(
                     f"The Hugging Face Hub seems to be down. Retrying in {wait_time} "
-                    "seconds."
+                    "seconds.",
+                    level=logging.DEBUG,
                 )
                 sleep(wait_time)
                 continue
@@ -958,34 +1004,37 @@
             elif benchmark_config.raise_errors:
                 raise e
             return e
+        else:
+            return InvalidBenchmark(
+                f"Failed to benchmark model {model_config.model_id!r} on dataset "
+                f"{dataset_config.name!r} after {num_attempts} attempts."
+            )
 
     def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any:  # noqa: ANN401
         """Alias for `self.benchmark()`."""
-
+        log(
             "Calling the `Benchmarker` class directly is deprecated. Please use the "
-            "`benchmark` function instead. This will be removed in a future version."
+            "`benchmark` function instead. This will be removed in a future version.",
+            level=logging.WARNING,
         )
         return self.benchmark(*args, **kwds)
 
 
 def model_has_been_benchmarked(
-
-
-
-    validation_split: bool,
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     benchmark_results: list[BenchmarkResult],
 ) -> bool:
     """Checks whether a model has already been benchmarked on a dataset.
 
     Args:
-
-            The model
-
-            The dataset.
-
-
-        validation_split:
-            Whether the model was evaluated on the validation split.
+        model_config:
+            The configuration of the model we are evaluating.
+        dataset_config:
+            The configuration of the dataset we are evaluating on.
+        benchmark_config:
+            The general benchmark configuration.
         benchmark_results:
             The benchmark results.
 
@@ -993,36 +1042,32 @@ def model_has_been_benchmarked(
         Whether the model has already been evaluated on the dataset.
     """
     for record in benchmark_results:
-
-
-
-
+        model_id_components = split_model_id(model_id=record.model)
+        same_model_id = model_id_components.model_id == model_config.model_id
+        same_revision = model_id_components.revision == model_config.revision
+        same_param = model_id_components.param == model_config.param
+        same_dataset = record.dataset == dataset_config.name
+        same_split = (
+            record.validation_split != benchmark_config.evaluate_test_split
+            or "val" not in dataset_config.splits
+        )
+        same_num_shots = (
+            record.few_shot == benchmark_config.few_shot
+            or not record.generative
+            or dataset_config.task.requires_zero_shot
+        )
+        if (
+            same_model_id
+            and same_revision
+            and same_param
+            and same_dataset
+            and same_split
+            and same_num_shots
+        ):
             return True
     return False
 
 
-def adjust_logging_level(verbose: bool, ignore_testing: bool = False) -> int:
-    """Adjust the logging level based on verbosity.
-
-    Args:
-        verbose:
-            Whether to output additional output.
-        ignore_testing:
-            Whether to ignore the testing flag.
-
-    Returns:
-        The logging level that was set.
-    """
-    if hasattr(sys, "_called_from_test") and not ignore_testing:
-        logging_level = logging.CRITICAL
-    elif verbose:
-        logging_level = logging.DEBUG
-    else:
-        logging_level = logging.INFO
-    logger.setLevel(logging_level)
-    return logging_level
-
-
 def clear_model_cache_fn(cache_dir: str) -> None:
     """Clear the model cache.
 
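The deduplication check now compares the structured components of a recorded model ID against the model configuration rather than comparing raw strings. A rough probe of the helper, with the attribute names taken from the diff above (the return type of split_model_id is not otherwise shown in this diff):

from euroeval.utils import split_model_id

# Hypothetical model ID; only the .model_id/.revision/.param attributes are
# assumed, based on how the diff above uses the returned object
components = split_model_id(model_id="some-org/some-model")
print(components.model_id, components.revision, components.param)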
@@ -1060,6 +1105,8 @@ def initial_logging(
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
+    num_finished_benchmarks: int,
+    num_total_benchmarks: int,
 ) -> None:
     """Initial logging at the start of the benchmarking process.
 
@@ -1070,6 +1117,10 @@ def initial_logging(
             The configuration of the dataset we are evaluating on.
         benchmark_config:
             The general benchmark configuration.
+        num_finished_benchmarks:
+            The number of benchmarks that have already been finished.
+        num_total_benchmarks:
+            The total number of benchmarks to be run.
     """
     model_id = model_config.model_id
     if model_config.revision and model_config.revision != "main":
@@ -1086,21 +1137,25 @@ def initial_logging(
     else:
         eval_type = "Benchmarking"
 
-
-        f"{eval_type} {model_id} on the {split_type} split of "
-        f"{dataset_config.pretty_name}"
+    log_once(
+        f"\n{eval_type} {model_id} on the {split_type} split of "
+        f"{dataset_config.pretty_name} ({num_finished_benchmarks + 1}/"
+        f"{num_total_benchmarks} benchmarks)...",
+        prefix=f"\n[{dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]",
    )
 
     if dataset_config.unofficial:
-
+        log_once(
             f"Note that the {dataset_config.name!r} dataset is unofficial, "
             "meaning that the resulting evaluation will not be included in the "
-            "official leaderboard."
+            "official leaderboard.",
+            level=logging.WARNING,
         )
 
     if benchmark_config.debug:
-
+        log_once(
             "Running in debug mode. This will output additional information, as "
             "well as store the model outputs in the current directory after each "
-            "batch. For this reason, evaluation will be slower."
+            "batch. For this reason, evaluation will be slower.",
+            level=logging.WARNING,
         )
euroeval/caching_utils.py
ADDED
@@ -0,0 +1,79 @@
+"""Caching utility functions."""
+
+import typing as t
+from functools import wraps
+
+from .constants import T
+
+
+def cache_arguments(
+    *arguments: str, disable_condition: t.Callable[[], bool] = lambda: False
+) -> t.Callable[[t.Callable[..., T]], t.Callable[..., T]]:
+    """Cache specified arguments of a function.
+
+    Args:
+        arguments:
+            The list of argument names to cache. If empty, all arguments are cached.
+        disable_condition:
+            A function that checks if cache should be disabled.
+
+    Returns:
+        A decorator that caches the specified arguments of a function.
+    """
+
+    def caching_decorator(func: t.Callable[..., T]) -> t.Callable[..., T]:
+        """Decorator that caches the specified arguments of a function.
+
+        Args:
+            func:
+                The function to decorate.
+
+        Returns:
+            The decorated function.
+        """
+        cache: dict[tuple, T] = dict()
+
+        @wraps(func)
+        def wrapper(*args, **kwargs) -> T:
+            """Wrapper function that caches the specified arguments.
+
+            Args:
+                *args:
+                    The positional arguments to the function.
+                **kwargs:
+                    The keyword arguments to the function.
+
+            Returns:
+                The result of the function.
+
+            Raises:
+                ValueError:
+                    If an argument name is not found in the function parameters.
+            """
+            if not arguments:
+                key = args + tuple(kwargs[k] for k in sorted(kwargs.keys()))
+            else:
+                func_params = func.__code__.co_varnames
+                key_items: list[t.Any] = []
+                for arg_name in arguments:
+                    if arg_name in kwargs:
+                        key_items.append(kwargs[arg_name])
+                    else:
+                        try:
+                            arg_index = func_params.index(arg_name)
+                            key_items.append(args[arg_index])
+                        except (ValueError, IndexError):
+                            raise ValueError(
+                                f"Argument {arg_name} not found in function "
+                                f"{func.__name__} parameters."
+                            )
+                key = tuple(key_items)
+
+            # Do not cache if the condition is met
+            if key not in cache or disable_condition():
+                cache[key] = func(*args, **kwargs)
+            return cache[key]
+
+        return wrapper
+
+    return caching_decorator
euroeval/callbacks.py
CHANGED
@@ -7,6 +7,8 @@ from collections.abc import Sized
 from tqdm.auto import tqdm
 from transformers.trainer_callback import ProgressCallback
 
+from .logging_utils import get_pbar
+
 if t.TYPE_CHECKING:
     from torch.utils.data import DataLoader
     from transformers.trainer_callback import TrainerControl, TrainerState
@@ -32,11 +34,8 @@ class NeverLeaveProgressCallback(ProgressCallback):
         """Callback actions when training begins."""
         if state.is_local_process_zero:
             desc = "Finetuning model"
-            self.training_bar =
-                total=None,
-                leave=False,
-                desc=desc,
-                disable=hasattr(sys, "_called_from_test"),
+            self.training_bar = get_pbar(
+                total=None, desc=desc, disable=hasattr(sys, "_called_from_test")
             )
             self.current_step = 0
 
@@ -67,9 +66,8 @@ class NeverLeaveProgressCallback(ProgressCallback):
         if state.is_local_process_zero and correct_dtype:
             if self.prediction_bar is None:
                 desc = "Evaluating model"
-                self.prediction_bar =
+                self.prediction_bar = get_pbar(
                     total=len(eval_dataloader),
-                    leave=False,
                     desc=desc,
                     disable=hasattr(sys, "_called_from_test"),
                 )
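get_pbar comes from the new euroeval/logging_utils.py, whose body is not included in this diff. Judging from the call sites (iterable=, total=, desc=, disable=, and the dropped leave=False), it is presumably a thin wrapper around tqdm with shared defaults; the following is a speculative sketch, not the actual implementation:

import typing as t

from tqdm.auto import tqdm


def get_pbar(
    iterable: t.Iterable | None = None,
    total: int | None = None,
    desc: str | None = None,
    disable: bool = False,
) -> tqdm:
    """Hypothetical reconstruction of a shared progress-bar factory."""
    return tqdm(
        iterable=iterable,
        total=total,
        desc=desc,
        disable=disable,
        leave=False,  # assumed default, since the call sites above no longer pass `leave`
    )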
euroeval/cli.py
CHANGED
@@ -188,7 +188,7 @@ from .tasks import get_all_tasks
 )
 @click.option(
     "--gpu-memory-utilization",
-    default=0.
+    default=0.8,
     show_default=True,
     help="The GPU memory utilization to use for vLLM. A larger value will result in "
     "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "