EuroEval 16.4.0-py3-none-any.whl → 16.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (71)
  1. euroeval/__init__.py +6 -0
  2. euroeval/benchmark_config_factory.py +51 -46
  3. euroeval/benchmark_modules/base.py +6 -5
  4. euroeval/benchmark_modules/hf.py +2 -9
  5. euroeval/benchmark_modules/litellm.py +14 -12
  6. euroeval/benchmark_modules/vllm.py +17 -10
  7. euroeval/benchmarker.py +61 -44
  8. euroeval/caching_utils.py +1 -1
  9. euroeval/cli.py +86 -8
  10. euroeval/constants.py +3 -0
  11. euroeval/data_loading.py +78 -30
  12. euroeval/data_models.py +326 -326
  13. euroeval/dataset_configs/__init__.py +10 -3
  14. euroeval/dataset_configs/bulgarian.py +56 -0
  15. euroeval/dataset_configs/czech.py +25 -29
  16. euroeval/dataset_configs/danish.py +51 -88
  17. euroeval/dataset_configs/dutch.py +48 -86
  18. euroeval/dataset_configs/english.py +45 -76
  19. euroeval/dataset_configs/estonian.py +36 -38
  20. euroeval/dataset_configs/faroese.py +19 -60
  21. euroeval/dataset_configs/finnish.py +36 -68
  22. euroeval/dataset_configs/french.py +39 -74
  23. euroeval/dataset_configs/german.py +45 -81
  24. euroeval/dataset_configs/greek.py +64 -0
  25. euroeval/dataset_configs/icelandic.py +54 -91
  26. euroeval/dataset_configs/italian.py +42 -78
  27. euroeval/dataset_configs/latvian.py +28 -34
  28. euroeval/dataset_configs/lithuanian.py +22 -26
  29. euroeval/dataset_configs/norwegian.py +72 -114
  30. euroeval/dataset_configs/polish.py +33 -60
  31. euroeval/dataset_configs/portuguese.py +33 -65
  32. euroeval/dataset_configs/serbian.py +64 -0
  33. euroeval/dataset_configs/slovak.py +19 -24
  34. euroeval/dataset_configs/spanish.py +42 -76
  35. euroeval/dataset_configs/swedish.py +48 -84
  36. euroeval/dataset_configs/ukrainian.py +64 -0
  37. euroeval/exceptions.py +1 -1
  38. euroeval/finetuning.py +3 -2
  39. euroeval/generation.py +5 -4
  40. euroeval/generation_utils.py +6 -5
  41. euroeval/languages.py +395 -323
  42. euroeval/metrics/huggingface.py +14 -3
  43. euroeval/metrics/llm_as_a_judge.py +1 -1
  44. euroeval/model_cache.py +6 -5
  45. euroeval/model_loading.py +1 -1
  46. euroeval/prompt_templates/__init__.py +2 -0
  47. euroeval/prompt_templates/classification.py +206 -0
  48. euroeval/prompt_templates/linguistic_acceptability.py +82 -43
  49. euroeval/prompt_templates/multiple_choice.py +81 -41
  50. euroeval/prompt_templates/named_entity_recognition.py +125 -44
  51. euroeval/prompt_templates/reading_comprehension.py +92 -43
  52. euroeval/prompt_templates/sentiment_classification.py +91 -43
  53. euroeval/prompt_templates/summarization.py +64 -39
  54. euroeval/prompt_templates/token_classification.py +279 -0
  55. euroeval/scores.py +4 -3
  56. euroeval/speed_benchmark.py +2 -1
  57. euroeval/task_group_utils/multiple_choice_classification.py +2 -1
  58. euroeval/task_group_utils/question_answering.py +24 -13
  59. euroeval/task_group_utils/sequence_classification.py +5 -4
  60. euroeval/task_group_utils/text_to_text.py +2 -1
  61. euroeval/task_group_utils/token_classification.py +11 -8
  62. euroeval/tasks.py +44 -1
  63. euroeval/tokenisation_utils.py +19 -10
  64. euroeval/types.py +10 -9
  65. euroeval/utils.py +6 -3
  66. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
  67. euroeval-16.5.0.dist-info/RECORD +81 -0
  68. euroeval-16.4.0.dist-info/RECORD +0 -75
  69. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
  70. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
  71. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py CHANGED
@@ -51,7 +51,13 @@ import importlib.metadata # noqa: E402
  from dotenv import load_dotenv # noqa: E402

  from .benchmarker import Benchmarker # noqa: E402
+ from .data_models import DatasetConfig # noqa: E402
  from .logging_utils import block_terminal_output # noqa: E402
+ from .tasks import ( # noqa: E402
+     MULTIPLE_CHOICE,
+     TEXT_CLASSIFICATION,
+     TOKEN_CLASSIFICATION,
+ )

  # Block unwanted terminal outputs. This blocks way more than the above, but since it
  # relies on importing from the `utils` module, external modules are already imported
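The `__init__.py` change above re-exports `DatasetConfig` and the `MULTIPLE_CHOICE`, `TEXT_CLASSIFICATION` and `TOKEN_CLASSIFICATION` task objects at the package root. A minimal sketch of the resulting import surface (illustrative only, not part of the diff):

from euroeval import (
    MULTIPLE_CHOICE,
    TEXT_CLASSIFICATION,
    TOKEN_CLASSIFICATION,
    Benchmarker,
    DatasetConfig,
)

# The re-exported task objects can be passed straight to `Benchmarker.benchmark`
# (see the euroeval/benchmarker.py diff below), e.g. `task=TEXT_CLASSIFICATION`.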
euroeval/benchmark_config_factory.py CHANGED
@@ -1,19 +1,20 @@
  """Factory class for creating dataset configurations."""

+ import collections.abc as c
  import sys
  import typing as t

  import torch

- from .data_models import BenchmarkConfig, BenchmarkConfigParams
+ from .data_models import BenchmarkConfig, BenchmarkConfigParams, DatasetConfig, Task
  from .dataset_configs import get_all_dataset_configs
  from .enums import Device
  from .exceptions import InvalidBenchmark
  from .languages import get_all_languages
- from .tasks import SPEED, get_all_tasks
+ from .tasks import get_all_tasks

  if t.TYPE_CHECKING:
-     from .data_models import Language, Task
+     from .data_models import Language


  def build_benchmark_config(
@@ -40,7 +41,7 @@ def build_benchmark_config(
          default_language_codes=language_codes,
      )

-     tasks, datasets = prepare_tasks_and_datasets(
+     dataset_configs = prepare_dataset_configs(
          task=benchmark_config_params.task,
          dataset=benchmark_config_params.dataset,
          dataset_languages=dataset_languages,
@@ -49,8 +50,7 @@ def build_benchmark_config(
      return BenchmarkConfig(
          model_languages=model_languages,
          dataset_languages=dataset_languages,
-         tasks=tasks,
-         datasets=datasets,
+         datasets=dataset_configs,
          batch_size=benchmark_config_params.batch_size,
          raise_errors=benchmark_config_params.raise_errors,
          cache_dir=benchmark_config_params.cache_dir,
@@ -80,7 +80,9 @@ def build_benchmark_config(
      )


- def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
+ def get_correct_language_codes(
+     language_codes: str | c.Sequence[str],
+ ) -> c.Sequence[str]:
      """Get correct language code(s).

      Args:
@@ -101,7 +103,7 @@ def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
      elif isinstance(language_codes, str):
          languages = [language_codes]
      else:
-         languages = language_codes
+         languages = list(language_codes)

      # If `languages` contains 'no' then also include 'nb' and 'nn'. Conversely, if
      # either 'nb' or 'nn' are specified then also include 'no'.
@@ -114,8 +116,9 @@ def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:


  def prepare_languages(
-     language_codes: str | list[str] | None, default_language_codes: list[str]
- ) -> list["Language"]:
+     language_codes: str | c.Sequence[str] | None,
+     default_language_codes: c.Sequence[str],
+ ) -> c.Sequence["Language"]:
      """Prepare language(s) for benchmarking.

      Args:
@@ -133,7 +136,7 @@ def prepare_languages(
      language_mapping = get_all_languages()

      # Create the list `languages_str` of language codes to use for models or datasets
-     languages_str: list[str]
+     languages_str: c.Sequence[str]
      if language_codes is None:
          languages_str = default_language_codes
      elif isinstance(language_codes, str):
@@ -150,12 +153,12 @@ def prepare_languages(
      return prepared_languages


- def prepare_tasks_and_datasets(
-     task: str | list[str] | None,
-     dataset_languages: list["Language"],
-     dataset: str | list[str] | None,
- ) -> tuple[list["Task"], list[str]]:
-     """Prepare task(s) and dataset(s) for benchmarking.
+ def prepare_dataset_configs(
+     task: "str | Task | c.Sequence[str | Task] | None",
+     dataset_languages: c.Sequence["Language"],
+     dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None",
+ ) -> c.Sequence["DatasetConfig"]:
+     """Prepare dataset config(s) for benchmarking.

      Args:
          task:
@@ -168,56 +171,58 @@ def prepare_dataset_configs(
              included, limited by the `task` and `dataset_languages` parameters.

      Returns:
-         The prepared tasks and datasets.
+         The prepared dataset configs.

      Raises:
          InvalidBenchmark:
              If the task or dataset is not found in the benchmark tasks or datasets.
      """
-     # Create a dictionary that maps benchmark tasks to their associated benchmark
-     # task objects, and a dictionary that maps dataset names to their associated
-     # dataset configuration objects
-     task_mapping = get_all_tasks()
-     all_dataset_configs = get_all_dataset_configs()
-
      # Create the list of dataset tasks
+     task_mapping = get_all_tasks()
      try:
          if task is None:
-             tasks = [t for t in task_mapping.values() if t != SPEED]
+             tasks = None
          elif isinstance(task, str):
              tasks = [task_mapping[task]]
+         elif isinstance(task, Task):
+             tasks = [task]
          else:
-             tasks = [task_mapping[t] for t in task]
+             tasks = [task_mapping[t] if isinstance(t, str) else t for t in task]
      except KeyError as e:
          raise InvalidBenchmark(f"Task {e} not found in the benchmark tasks.") from e

-     all_official_datasets = [
-         dataset_name
-         for dataset_name, dataset_config in all_dataset_configs.items()
+     # Create the list of dataset configs
+     all_dataset_configs = get_all_dataset_configs()
+     all_official_dataset_configs: c.Sequence[DatasetConfig] = [
+         dataset_config
+         for dataset_config in all_dataset_configs.values()
          if not dataset_config.unofficial
      ]
-     if dataset is None:
-         dataset = all_official_datasets
-     elif isinstance(dataset, str):
-         dataset = [dataset]
-
-     all_datasets = list(all_dataset_configs.keys())
-     invalid_datasets = set(dataset) - set(all_datasets)
-     if invalid_datasets:
+     try:
+         if dataset is None:
+             datasets = all_official_dataset_configs
+         elif isinstance(dataset, str):
+             datasets = [all_dataset_configs[dataset]]
+         elif isinstance(dataset, DatasetConfig):
+             datasets = [dataset]
+         else:
+             datasets = [
+                 all_dataset_configs[d] if isinstance(d, str) else d for d in dataset
+             ]
+     except KeyError as e:
          raise InvalidBenchmark(
-             f"Dataset(s) {', '.join(invalid_datasets)} not found in the benchmark "
-             "datasets."
-         )
+             f"Dataset {e} not found in the benchmark datasets."
+         ) from e

+     # Filter the dataset configs based on the specified tasks and languages
      datasets = [
-         dataset_name
-         for dataset_name, dataset_config in all_dataset_configs.items()
-         if dataset_name in dataset
-         and dataset_config.task in tasks
-         and set(dataset_config.languages).intersection(dataset_languages)
+         ds
+         for ds in datasets
+         if (tasks is None or ds.task in tasks)
+         and any(lang in dataset_languages for lang in ds.languages)
      ]

-     return tasks, datasets
+     return datasets


  def prepare_device(device: Device | None) -> torch.device:
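For orientation, a hedged sketch of how the reworked helper can be called; `prepare_dataset_configs`, `get_all_tasks` and `get_all_languages` are internal APIs taken from the imports in the diff above, and the slicing of the task mapping is purely illustrative:

from euroeval.benchmark_config_factory import prepare_dataset_configs
from euroeval.languages import get_all_languages
from euroeval.tasks import get_all_tasks

all_tasks = get_all_tasks()          # mapping from task name to Task object
all_languages = get_all_languages()  # mapping from language code to Language object

# Task objects (or a mix of names and objects) are accepted directly, and the
# function now returns DatasetConfig objects instead of plain dataset names.
configs = prepare_dataset_configs(
    task=list(all_tasks.values())[:1],
    dataset=None,  # None selects all official datasets for the chosen tasks/languages
    dataset_languages=list(all_languages.values()),
)
print(f"Selected {len(configs)} dataset configs")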
euroeval/benchmark_modules/base.py CHANGED
@@ -52,7 +52,7 @@ class BenchmarkModule(ABC):
      fresh_model: bool
      batching_preference: "BatchingPreference"
      high_priority: bool
-     allowed_params: dict[re.Pattern, list[str]] = {re.compile(r".*"): []}
+     allowed_params: dict[re.Pattern, c.Sequence[str]] = {re.compile(r".*"): []}

      def __init__(
          self,
@@ -83,11 +83,12 @@ class BenchmarkModule(ABC):

      def _log_metadata(self) -> None:
          """Log the metadata of the model."""
+         model_id = self.model_config.model_id
          logging_msg: str = " ↳ "
          if self.num_params < 0:
-             logging_msg += "The model has an unknown number of parameters, "
+             logging_msg += f"The model {model_id} has an unknown number of parameters, "
          else:
-             logging_msg += f"The model has {self.num_params:,} parameters, "
+             logging_msg += f"The model {model_id} has {self.num_params:,} parameters, "
          if self.vocab_size < 0:
              logging_msg += "an unknown vocabulary size, "
          else:
@@ -166,7 +167,7 @@ class BenchmarkModule(ABC):

      @property
      @abstractmethod
-     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
+     def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
          """The data collator used to prepare samples during finetuning.

          Returns:
@@ -240,7 +241,7 @@ class BenchmarkModule(ABC):

      def prepare_datasets(
          self, datasets: list[DatasetDict], task: "Task"
-     ) -> list[DatasetDict]:
+     ) -> c.Sequence[DatasetDict]:
          """Prepare the datasets for the model.

          This includes things like tokenisation.
euroeval/benchmark_modules/hf.py CHANGED
@@ -267,7 +267,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
          return model_max_length

      @property
-     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
+     def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
          """The data collator used to prepare samples during finetuning.

          Returns:
@@ -775,15 +775,8 @@ def get_model_repo_info(
              level=logging.DEBUG,
          )
          return None
-     except (RepositoryNotFoundError, HFValidationError):
+     except (RepositoryNotFoundError, HFValidationError, HfHubHTTPError):
          return None
-     except HfHubHTTPError as e:
-         if "unauthorized" in str(e).lower():
-             raise InvalidModel(
-                 "It seems like your specified Hugging Face API key is invalid. "
-                 "Please double-check your API key."
-             ) from e
-         raise InvalidModel(str(e)) from e
      except (OSError, RequestException) as e:
          if internet_connection_available():
              errors.append(e)
euroeval/benchmark_modules/litellm.py CHANGED
@@ -310,7 +310,7 @@ class LiteLLMModel(BenchmarkModule):
              InvalidBenchmark:
                  If the inputs do not contain either 'messages' or 'text' keys.
          """
-         model_inputs: list[list[litellm.AllMessageValues] | str]
+         model_inputs: c.Sequence[c.Sequence[litellm.AllMessageValues] | str]
          if "messages" in inputs:
              model_inputs = inputs["messages"]
          elif "text" in inputs:
@@ -331,9 +331,9 @@ class LiteLLMModel(BenchmarkModule):
          )

          all_responses: dict[int, "ModelResponse"] = {}
-         inputs_to_run: list[tuple[int, list[litellm.AllMessageValues] | str]] = list(
-             enumerate(model_inputs)
-         )
+         inputs_to_run: c.Sequence[
+             tuple[int, c.Sequence[litellm.AllMessageValues] | str]
+         ] = list(enumerate(model_inputs))
          for attempt in range(num_attempts := 10):
              if not inputs_to_run:
                  break
@@ -540,7 +540,7 @@ class LiteLLMModel(BenchmarkModule):
              )
              ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
              keys_and_their_types = {
-                 tag_name: (list[str], ...) for tag_name in ner_tag_names
+                 tag_name: (c.Sequence[str], ...) for tag_name in ner_tag_names
              }
              pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
              generation_kwargs["response_format"] = pydantic_class
@@ -686,9 +686,11 @@ class LiteLLMModel(BenchmarkModule):
      async def _generate_async(
          self,
          model_id: str,
-         inputs: list[list[litellm.AllMessageValues] | str],
+         inputs: c.Sequence[c.Sequence[litellm.AllMessageValues] | str],
          **generation_kwargs,
-     ) -> tuple[list[tuple[int, "ModelResponse"]], list[tuple[int, Exception]]]:
+     ) -> tuple[
+         c.Sequence[tuple[int, "ModelResponse"]], c.Sequence[tuple[int, Exception]]
+     ]:
          """Generate outputs from the model asynchronously.

          Args:
@@ -789,7 +791,7 @@ class LiteLLMModel(BenchmarkModule):

      @staticmethod
      def _create_model_output(
-         model_responses: list["ModelResponse"], model_id: str
+         model_responses: c.Sequence["ModelResponse"], model_id: str
      ) -> GenerativeModelOutput:
          """Create a GenerativeModelOutput object from a list of ModelResponse objects.

@@ -863,7 +865,7 @@ class LiteLLMModel(BenchmarkModule):
                  )
                  continue

-             logprobs_list: list[list[tuple[str, float]]]
+             logprobs_list: c.Sequence[c.Sequence[tuple[str, float]]]
              if isinstance(logprobs_obj, ChoiceLogprobs):
                  logprobs_list = [
                      [
@@ -1159,7 +1161,7 @@ class LiteLLMModel(BenchmarkModule):
          return -1

      @property
-     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
+     def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
          """The data collator used to prepare samples during finetuning.

          Returns:
@@ -1545,7 +1547,7 @@ class LiteLLMModel(BenchmarkModule):
          # First attempt is a test run with a single conversation to handle errors
          # quickly. We repeat this multiple times to deal with different types of
          # errors, and stop if we get a successful response.
-         test_input: list[litellm.AllMessageValues] | str
+         test_input: c.Sequence[litellm.AllMessageValues] | str
          if self.generative_type == GenerativeType.BASE:
              test_input = "Test message"
          else:
@@ -1604,7 +1606,7 @@ def try_download_ollama_model(model_id: str) -> bool:
      )

      try:
-         downloaded_ollama_models: list[str] = [
+         downloaded_ollama_models: c.Sequence[str] = [
              model_obj.model
              for model_obj in ollama.list().models
              if model_obj.model is not None
euroeval/benchmark_modules/vllm.py CHANGED
@@ -416,12 +416,18 @@ class VLLMModel(HuggingFaceEncoderModel):
                  json=structured_generation_schema
              )
          elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
-             structured_outputs = StructuredOutputsParams(
-                 choice=[
-                     self.dataset_config.prompt_label_mapping[label]
-                     for label in self.dataset_config.labels
+             choice_labels = [
+                 self.dataset_config.prompt_label_mapping[label]
+                 for label in self.dataset_config.labels
+             ]
+             if "first_label_token_mapping" in self.buffer and isinstance(
+                 self.buffer["first_label_token_mapping"], dict
+             ):
+                 choice_labels = [
+                     self.buffer["first_label_token_mapping"][label]
+                     for label in choice_labels
                  ]
-             )
+             structured_outputs = StructuredOutputsParams(choice=choice_labels)
              log_once(
                  "Using structured generation with the choices: "
                  f"{structured_outputs.choice!r}.",
@@ -452,7 +458,7 @@ class VLLMModel(HuggingFaceEncoderModel):

          # If any of the prompts are empty then we need to replace them with a BOS token
          # so that the vLLM model can generate from them
-         prompts: list[str] = inputs["text"]
+         prompts: c.Sequence[str] = inputs["text"]
          if any(len(prompt) == 0 for prompt in prompts):
              log("Found empty prompts, replacing with BOS token.", level=logging.DEBUG)
              prompts = [
@@ -556,13 +562,14 @@ class VLLMModel(HuggingFaceEncoderModel):
          )

          # Parse the raw model outputs
-         completion_ids: list[list[int]] = [
+         completion_ids: c.Sequence[c.Sequence[int]] = [
              list(output.outputs[0].token_ids) for output in raw_outputs
          ]
          completions = self._tokeniser.batch_decode(
              sequences=[
                  torch.LongTensor(completion_id) for completion_id in completion_ids
-             ]
+             ],
+             skip_special_tokens=True,
          )
          if (
              self.end_of_reasoning_token is not None
@@ -608,7 +615,7 @@ class VLLMModel(HuggingFaceEncoderModel):

          # Add logprobs scores to the output
          if self.buffer["first_label_token_mapping"]:
-             scores: list[list[list[tuple[str, float]]]] = [
+             scores: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]] = [
                  [
                      [
                          (obj.decoded_token or "", obj.logprob)
@@ -719,7 +726,7 @@ class VLLMModel(HuggingFaceEncoderModel):
          return model_config

      @property
-     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
+     def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
          """The data collator used to prepare samples during finetuning.

          Returns:
euroeval/benchmarker.py CHANGED
@@ -1,5 +1,6 @@
  """Class that benchmarks language models."""

+ import collections.abc as c
  import contextlib
  import datetime as dt
  import json
@@ -38,7 +39,7 @@ from .utils import (

  if t.TYPE_CHECKING:
      from .benchmark_modules import BenchmarkModule
-     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
+     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig, Task


  class Benchmarker:
@@ -62,11 +63,11 @@ class Benchmarker:
          self,
          progress_bar: bool = True,
          save_results: bool = True,
-         task: str | list[str] | None = None,
-         dataset: list[str] | str | None = None,
-         language: str | list[str] = "all",
-         model_language: str | list[str] | None = None,
-         dataset_language: str | list[str] | None = None,
+         task: "str | Task | c.Sequence[str | Task] | None" = None,
+         dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None" = None,
+         language: str | c.Sequence[str] = "all",
+         model_language: str | c.Sequence[str] | None = None,
+         dataset_language: str | c.Sequence[str] | None = None,
          device: Device | None = None,
          batch_size: int = 32,
          raise_errors: bool = False,
@@ -176,6 +177,8 @@ class Benchmarker:
              ValueError:
                  If both `task` and `dataset` are specified, or if `download_only`
                  is True and we have no internet connection.
+             ImportError:
+                 If `hf_transfer` is enabled but not installed.
          """
          if task is not None and dataset is not None:
              raise ValueError("Only one of `task` and `dataset` can be specified.")
@@ -236,13 +239,13 @@ class Benchmarker:
          )

          # Initialise variable storing model lists, so we only have to fetch it once
-         self._model_lists: dict[str, list[str]] | None = None
+         self._model_lists: dict[str, c.Sequence[str]] | None = None

          self.results_path = Path.cwd() / "euroeval_benchmark_results.jsonl"
          adjust_logging_level(verbose=self.benchmark_config.verbose)

      @property
-     def benchmark_results(self) -> list[BenchmarkResult]:
+     def benchmark_results(self) -> c.Sequence[BenchmarkResult]:
          """The benchmark results.

          Returns:
@@ -320,14 +323,14 @@ class Benchmarker:

      def benchmark(
          self,
-         model: list[str] | str,
-         task: str | list[str] | None = None,
-         dataset: list[str] | str | None = None,
+         model: c.Sequence[str] | str,
+         task: "str | Task | c.Sequence[str | Task] | None" = None,
+         dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None" = None,
          progress_bar: bool | None = None,
          save_results: bool | None = None,
-         language: str | list[str] | None = None,
-         model_language: str | list[str] | None = None,
-         dataset_language: str | list[str] | None = None,
+         language: str | c.Sequence[str] | None = None,
+         model_language: str | c.Sequence[str] | None = None,
+         dataset_language: str | c.Sequence[str] | None = None,
          device: Device | None = None,
          batch_size: int | None = None,
          raise_errors: bool | None = None,
@@ -347,7 +350,7 @@ class Benchmarker:
          force: bool | None = None,
          verbose: bool | None = None,
          debug: bool | None = None,
-     ) -> list[BenchmarkResult]:
+     ) -> c.Sequence[BenchmarkResult]:
          """Benchmarks models on datasets.

          Args:
@@ -605,9 +608,7 @@ class Benchmarker:
              clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)

          model_ids = self._prepare_model_ids(model_id=model)
-         dataset_configs = prepare_dataset_configs(
-             dataset_names=benchmark_config.datasets
-         )
+         dataset_configs = benchmark_config.datasets

          # Get all the model configs
          model_configs: list[ModelConfig] = list()
@@ -625,27 +626,40 @@ class Benchmarker:
              log(e.message, level=logging.ERROR)

          # Create a dictionary that takes each model config to the dataset configs that
-         # we need to benchmark the model on. Here we remove the datasets that the model
-         # has already been benchmarked on, or datasets that the model cannot be
-         # benchmarked on.
-         model_config_to_dataset_configs: dict[ModelConfig, list[DatasetConfig]] = {
+         # we need to benchmark the model on. We initially include all the relevant
+         # datasets for each model.
+         model_config_to_dataset_configs: dict[
+             ModelConfig, c.Sequence[DatasetConfig]
+         ] = {
              model_config: [
                  dataset_config
                  for dataset_config in dataset_configs
-                 if (
-                     benchmark_config.force
-                     or not model_has_been_benchmarked(
-                         model_config=model_config,
-                         dataset_config=dataset_config,
-                         benchmark_config=benchmark_config,
-                         benchmark_results=self.benchmark_results,
-                     )
-                 )
-                 and model_config.model_type in dataset_config.allowed_model_types
+                 if model_config.model_type in dataset_config.allowed_model_types
              ]
              for model_config in model_configs
          }

+         # Initialise the current benchmark results with all the ones that we have cached
+         # on disk already (can be none), and remove those datasets from the mapping
+         current_benchmark_results: list[BenchmarkResult] = list()
+         for (
+             model_config,
+             model_dataset_configs,
+         ) in model_config_to_dataset_configs.items():
+             new_model_dataset_configs: list[DatasetConfig] = list()
+             for dataset_config in model_dataset_configs:
+                 benchmark_record = get_record(
+                     model_config=model_config,
+                     dataset_config=dataset_config,
+                     benchmark_config=benchmark_config,
+                     benchmark_results=self.benchmark_results,
+                 )
+                 if benchmark_record is not None and not benchmark_config.force:
+                     current_benchmark_results.append(benchmark_record)
+                 else:
+                     new_model_dataset_configs.append(dataset_config)
+             model_config_to_dataset_configs[model_config] = new_model_dataset_configs
+
          total_benchmarks = sum(
              len(dataset_configs)
              for dataset_configs in model_config_to_dataset_configs.values()
@@ -656,10 +670,9 @@ class Benchmarker:
                  "benchmarked on all the selected datasets.",
                  level=logging.INFO,
              )
-             return list()
+             return current_benchmark_results

          num_finished_benchmarks = 0
-         current_benchmark_results: list[BenchmarkResult] = list()
          benchmark_params_to_revert: dict[str, t.Any] = dict()
          for model_config in model_configs:
              if not model_config_to_dataset_configs[model_config]:
@@ -809,7 +822,9 @@ class Benchmarker:
          if benchmark_config.clear_model_cache:
              clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)

-         log(f"Completed {num_finished_benchmarks:,} benchmarks.\n", level=logging.INFO)
+         log(
+             f"\nCompleted {num_finished_benchmarks:,} benchmarks.\n", level=logging.INFO
+         )

          # This avoids the following warning at the end of the benchmarking:
          # Warning: WARNING: process group has NOT been destroyed before we destruct
@@ -823,7 +838,7 @@ class Benchmarker:
              destroy_process_group()
          return current_benchmark_results

-     def _prepare_model_ids(self, model_id: list[str] | str) -> list[str]:
+     def _prepare_model_ids(self, model_id: c.Sequence[str] | str) -> c.Sequence[str]:
          """Prepare the model ID(s) to be benchmarked.

          Args:
@@ -1020,13 +1035,13 @@ class Benchmarker:
          return self.benchmark(*args, **kwds)


- def model_has_been_benchmarked(
+ def get_record(
      model_config: "ModelConfig",
      dataset_config: "DatasetConfig",
      benchmark_config: "BenchmarkConfig",
-     benchmark_results: list[BenchmarkResult],
- ) -> bool:
-     """Checks whether a model has already been benchmarked on a dataset.
+     benchmark_results: c.Sequence[BenchmarkResult],
+ ) -> BenchmarkResult | None:
+     """Get the benchmark record for a given model and dataset.

      Args:
          model_config:
@@ -1039,7 +1054,7 @@ def model_has_been_benchmarked(
              The benchmark results.

      Returns:
-         Whether the model has already been evaluated on the dataset.
+         The benchmark record, or None if no such record exists.
      """
      for record in benchmark_results:
          model_id_components = split_model_id(model_id=record.model)
@@ -1064,8 +1079,8 @@ def model_has_been_benchmarked(
              and same_split
              and same_num_shots
          ):
-             return True
-     return False
+             return record
+     return None


  def clear_model_cache_fn(cache_dir: str) -> None:
@@ -1086,7 +1101,9 @@ def clear_model_cache_fn(cache_dir: str) -> None:
              rmtree(sub_model_dir)


- def prepare_dataset_configs(dataset_names: list[str]) -> list["DatasetConfig"]:
+ def prepare_dataset_configs(
+     dataset_names: c.Sequence[str],
+ ) -> c.Sequence["DatasetConfig"]:
      """Prepare the dataset configuration(s) to be benchmarked.

      Args:
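Together with the factory changes above, the public `Benchmarker` API now accepts `Task` and `DatasetConfig` objects directly. A hedged sketch of the new calling convention (the model id is a placeholder, not taken from the diff):

from euroeval import Benchmarker, TEXT_CLASSIFICATION

benchmarker = Benchmarker(progress_bar=False, save_results=False)

# `task` accepts a Task object (or a sequence mixing task names and Task objects),
# and `dataset` likewise accepts DatasetConfig objects; only one of the two may be
# given at a time.
results = benchmarker.benchmark(
    model="my-org/my-model",  # placeholder model id
    task=TEXT_CLASSIFICATION,
)
print(f"Collected {len(results)} benchmark results")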
euroeval/caching_utils.py CHANGED
@@ -54,7 +54,7 @@ def cache_arguments(
              key = args + tuple(kwargs[k] for k in sorted(kwargs.keys()))
          else:
              func_params = func.__code__.co_varnames
-             key_items: list[t.Any] = []
+             key_items: list[t.Any] = list()
              for arg_name in arguments:
                  if arg_name in kwargs:
                      key_items.append(kwargs[arg_name])