EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries, and is provided for informational purposes only.
- euroeval/__init__.py +32 -14
- euroeval/benchmark_config_factory.py +92 -180
- euroeval/benchmark_modules/base.py +49 -39
- euroeval/benchmark_modules/fresh.py +35 -21
- euroeval/benchmark_modules/hf.py +280 -244
- euroeval/benchmark_modules/litellm.py +752 -312
- euroeval/benchmark_modules/vllm.py +570 -268
- euroeval/benchmarker.py +651 -528
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +49 -38
- euroeval/constants.py +44 -25
- euroeval/data_loading.py +111 -55
- euroeval/data_models.py +490 -323
- euroeval/dataset_configs/__init__.py +26 -4
- euroeval/dataset_configs/bosnian.py +39 -0
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/croatian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +78 -50
- euroeval/dataset_configs/dutch.py +74 -44
- euroeval/dataset_configs/english.py +71 -36
- euroeval/dataset_configs/estonian.py +111 -0
- euroeval/dataset_configs/faroese.py +25 -18
- euroeval/dataset_configs/finnish.py +63 -26
- euroeval/dataset_configs/french.py +65 -32
- euroeval/dataset_configs/german.py +77 -36
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +68 -57
- euroeval/dataset_configs/italian.py +68 -36
- euroeval/dataset_configs/latvian.py +87 -0
- euroeval/dataset_configs/lithuanian.py +64 -0
- euroeval/dataset_configs/norwegian.py +98 -72
- euroeval/dataset_configs/polish.py +96 -0
- euroeval/dataset_configs/portuguese.py +63 -40
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/slovene.py +56 -0
- euroeval/dataset_configs/spanish.py +68 -34
- euroeval/dataset_configs/swedish.py +82 -41
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/enums.py +12 -6
- euroeval/exceptions.py +21 -1
- euroeval/finetuning.py +34 -26
- euroeval/generation.py +76 -41
- euroeval/generation_utils.py +169 -34
- euroeval/languages.py +1020 -188
- euroeval/logging_utils.py +268 -0
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +85 -0
- euroeval/metrics/huggingface.py +216 -0
- euroeval/metrics/llm_as_a_judge.py +260 -0
- euroeval/metrics/pipeline.py +289 -0
- euroeval/metrics/speed.py +48 -0
- euroeval/model_cache.py +40 -21
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +157 -22
- euroeval/prompt_templates/multiple_choice.py +159 -17
- euroeval/prompt_templates/named_entity_recognition.py +318 -21
- euroeval/prompt_templates/reading_comprehension.py +207 -16
- euroeval/prompt_templates/sentiment_classification.py +205 -22
- euroeval/prompt_templates/summarization.py +122 -22
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +20 -9
- euroeval/speed_benchmark.py +11 -12
- euroeval/task_group_utils/multiple_choice_classification.py +21 -12
- euroeval/task_group_utils/question_answering.py +101 -73
- euroeval/task_group_utils/sequence_classification.py +144 -61
- euroeval/task_group_utils/text_to_text.py +33 -12
- euroeval/task_group_utils/token_classification.py +86 -89
- euroeval/tasks.py +75 -16
- euroeval/tokenisation_utils.py +603 -0
- euroeval/types.py +17 -11
- euroeval/utils.py +332 -137
- euroeval-16.7.1.dist-info/METADATA +623 -0
- euroeval-16.7.1.dist-info/RECORD +84 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -737
- euroeval/metrics.py +0 -452
- euroeval/tokenization_utils.py +0 -498
- euroeval-15.12.0.dist-info/METADATA +0 -285
- euroeval-15.12.0.dist-info/RECORD +0 -63
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED
@@ -12,14 +12,17 @@ import warnings
 from termcolor import colored
 
 # Block specific warnings before importing anything else, as they can be noisy
-
-
-
-logging.getLogger("
-
+if os.getenv("FULL_LOG") != "1":
+    warnings.filterwarnings("ignore", category=UserWarning)
+    warnings.filterwarnings("ignore", category=FutureWarning)
+    logging.getLogger("httpx").setLevel(logging.CRITICAL)
+    logging.getLogger("datasets").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm").setLevel(logging.CRITICAL)
+    os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
 
 # Set up logging
-fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
+# fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
+fmt = colored("%(message)s", "light_yellow")
 logging.basicConfig(
     level=logging.CRITICAL if hasattr(sys, "_called_from_test") else logging.INFO,
     format=fmt,
@@ -48,7 +51,13 @@ import importlib.metadata  # noqa: E402
 from dotenv import load_dotenv  # noqa: E402
 
 from .benchmarker import Benchmarker  # noqa: E402
-from .
+from .data_models import DatasetConfig  # noqa: E402
+from .logging_utils import block_terminal_output  # noqa: E402
+from .tasks import (  # noqa: E402
+    MULTIPLE_CHOICE,
+    TEXT_CLASSIFICATION,
+    TOKEN_CLASSIFICATION,
+)
 
 # Block unwanted terminal outputs. This blocks way more than the above, but since it
 # relies on importing from the `utils` module, external modules are already imported
@@ -77,15 +86,18 @@ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 os.environ["OMP_NUM_THREADS"] = "1"
 
 
-# Disable a warning from Ray regarding the detection of the number of CPUs
-os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"
-
-
 # Avoid the "Cannot re-initialize CUDA in forked subprocess" error - see
 # https://github.com/vllm-project/vllm/issues/6152 for more
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
 
+# Allow long max model length in vLLM. This happens when vLLM registers that the model
+# has a shorter context length than the value we are inserting. But since we do a
+# thorough check of the model's config before setting the context length, we trust our
+# own checks and ignore the internal vLLM check.
+os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
+
+
 # Avoid the "Unclosed client session" error when evaluating Ollama models with LiteLLM.
 # The error comes from the `aiohttp` package, and this environment variable forces the
 # use of `httpx` instead.
@@ -93,9 +105,15 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
 
 
-#
-#
-os.environ["VLLM_USE_V1"] = "
+# Enable the newer vLLM V1 engine, which is faster and offers more compatibility with
+# newer models
+os.environ["VLLM_USE_V1"] = "1"
+
+
+# Use the FlashInfer flash-attention backend for vLLM, unless the user has already
+# specified a different backend.
+if os.getenv("VLLM_ATTENTION_BACKEND") is None:
+    os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
 
 
 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
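For readers skimming the `__init__.py` hunks above: the new version silences third-party noise at import time unless `FULL_LOG=1` is set, and only fills in vLLM-related environment variables that the user has not already set. A condensed sketch of that pattern follows; the `FULL_LOG` and `VLLM_ATTENTION_BACKEND` names come from the diff, the rest is illustrative rather than the exact module code.

# Condensed sketch of the import-time defaults added above (illustrative).
import logging
import os
import warnings

if os.getenv("FULL_LOG") != "1":
    # Silence noisy third-party warnings and loggers by default.
    warnings.filterwarnings("ignore", category=UserWarning)
    logging.getLogger("httpx").setLevel(logging.CRITICAL)

# Only default the vLLM attention backend when the user has not chosen one.
os.environ.setdefault("VLLM_ATTENTION_BACKEND", "FLASHINFER")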
euroeval/benchmark_config_factory.py
CHANGED
@@ -1,173 +1,82 @@
 """Factory class for creating dataset configurations."""
 
-import
+import collections.abc as c
 import sys
 import typing as t
 
 import torch
 
-from .data_models import BenchmarkConfig
+from .data_models import BenchmarkConfig, BenchmarkConfigParams, DatasetConfig, Task
 from .dataset_configs import get_all_dataset_configs
 from .enums import Device
 from .exceptions import InvalidBenchmark
 from .languages import get_all_languages
-from .tasks import SPEED, get_all_tasks
 
 if t.TYPE_CHECKING:
-    from .data_models import Language
-
-
-logger = logging.getLogger("euroeval")
+    from .data_models import Language
 
 
 def build_benchmark_config(
-
-    save_results: bool,
-    task: str | list[str] | None,
-    dataset: str | list[str] | None,
-    language: str | list[str],
-    model_language: str | list[str] | None,
-    dataset_language: str | list[str] | None,
-    device: Device | None,
-    batch_size: int,
-    raise_errors: bool,
-    cache_dir: str,
-    api_key: str | None,
-    force: bool,
-    verbose: bool,
-    trust_remote_code: bool,
-    clear_model_cache: bool,
-    evaluate_test_split: bool,
-    few_shot: bool,
-    num_iterations: int,
-    api_base: str | None,
-    api_version: str | None,
-    gpu_memory_utilization: float,
-    debug: bool,
-    run_with_cli: bool,
-    only_allow_safetensors: bool,
-    first_time: bool = False,
+    benchmark_config_params: BenchmarkConfigParams,
 ) -> BenchmarkConfig:
     """Create a benchmark configuration.
 
     Args:
-
-
-        save_results:
-            Whether to save the benchmark results to a file.
-        task:
-            The tasks to include for dataset. If None then datasets will not be
-            filtered based on their task.
-        dataset:
-            The datasets to include for task. If None then all datasets will be
-            included, limited by the `task` parameter.
-        language:
-            The language codes of the languages to include, both for models and
-            datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
-            to 'all' if all languages should be considered.
-        model_language:
-            The language codes of the languages to include for models. If None then
-            the `language` parameter will be used.
-        dataset_language:
-            The language codes of the languages to include for datasets. If None then
-            the `language` parameter will be used.
-        device:
-            The device to use for running the models. If None then the device will be
-            set automatically.
-        batch_size:
-            The batch size to use for running the models.
-        raise_errors:
-            Whether to raise errors when running the benchmark.
-        cache_dir:
-            The directory to use for caching the models.
-        api_key:
-            The API key to use for a given inference server.
-        force:
-            Whether to force the benchmark to run even if the results are already
-            cached.
-        verbose:
-            Whether to print verbose output when running the benchmark. This is
-            automatically set if `debug` is True.
-        trust_remote_code:
-            Whether to trust remote code when running the benchmark.
-        clear_model_cache:
-            Whether to clear the model cache before running the benchmark.
-        evaluate_test_split:
-            Whether to use the test split for the datasets.
-        few_shot:
-            Whether to use few-shot learning for the models.
-        num_iterations:
-            The number of iterations each model should be evaluated for.
-        api_base:
-            The base URL for a given inference API. Only relevant if `model` refers to a
-            model on an inference API.
-        api_version:
-            The version of the API to use for a given inference API.
-        gpu_memory_utilization:
-            The GPU memory utilization to use for vLLM. A larger value will result in
-            faster evaluation, but at the risk of running out of GPU memory. Only reduce
-            this if you are running out of GPU memory. Only relevant if the model is
-            generative.
-        debug:
-            Whether to run the benchmark in debug mode.
-        run_with_cli:
-            Whether the benchmark is being run with the CLI.
-        only_allow_safetensors:
-            Whether to only allow evaluations of models stored as safetensors.
-        first_time:
-            Whether this is the first time the benchmark configuration is being created.
-            Defaults to False.
+        benchmark_config_params:
+            The parameters for creating the benchmark configuration.
 
     Returns:
         The benchmark configuration.
     """
-    language_codes = get_correct_language_codes(
-
-        language_codes=model_language, default_language_codes=language_codes
+    language_codes = get_correct_language_codes(
+        language_codes=benchmark_config_params.language
     )
-
-        language_codes=
+    languages = prepare_languages(
+        language_codes=benchmark_config_params.language,
+        default_language_codes=language_codes,
    )
 
-
-        task=task,
+    dataset_configs = prepare_dataset_configs(
+        task=benchmark_config_params.task,
+        dataset=benchmark_config_params.dataset,
+        languages=languages,
    )
 
-    torch_device = prepare_device(device=device)
-
-    # Set variable with number of iterations
-    if hasattr(sys, "_called_from_test"):
-        num_iterations = 1
-
     return BenchmarkConfig(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        datasets=dataset_configs,
+        languages=languages,
+        finetuning_batch_size=benchmark_config_params.finetuning_batch_size,
+        raise_errors=benchmark_config_params.raise_errors,
+        cache_dir=benchmark_config_params.cache_dir,
+        api_key=benchmark_config_params.api_key,
+        force=benchmark_config_params.force,
+        progress_bar=benchmark_config_params.progress_bar,
+        save_results=benchmark_config_params.save_results,
+        verbose=benchmark_config_params.verbose or benchmark_config_params.debug,
+        device=prepare_device(device=benchmark_config_params.device),
+        trust_remote_code=benchmark_config_params.trust_remote_code,
+        clear_model_cache=benchmark_config_params.clear_model_cache,
+        evaluate_test_split=benchmark_config_params.evaluate_test_split,
+        few_shot=benchmark_config_params.few_shot,
+        num_iterations=(
+            1
+            if hasattr(sys, "_called_from_test")
+            else benchmark_config_params.num_iterations
+        ),
+        api_base=benchmark_config_params.api_base,
+        api_version=benchmark_config_params.api_version,
+        gpu_memory_utilization=benchmark_config_params.gpu_memory_utilization,
+        generative_type=benchmark_config_params.generative_type,
+        debug=benchmark_config_params.debug,
+        run_with_cli=benchmark_config_params.run_with_cli,
+        requires_safetensors=benchmark_config_params.requires_safetensors,
+        download_only=benchmark_config_params.download_only,
     )
 
 
-def get_correct_language_codes(
+def get_correct_language_codes(
+    language_codes: str | c.Sequence[str],
+) -> c.Sequence[str]:
     """Get correct language code(s).
 
     Args:
@@ -188,7 +97,7 @@ def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
     elif isinstance(language_codes, str):
         languages = [language_codes]
     else:
-        languages = language_codes
+        languages = list(language_codes)
 
     # If `languages` contains 'no' then also include 'nb' and 'nn'. Conversely, if
     # either 'nb' or 'nn' are specified then also include 'no'.
@@ -201,8 +110,9 @@ def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
 
 
 def prepare_languages(
-    language_codes: str |
-
+    language_codes: str | c.Sequence[str] | None,
+    default_language_codes: c.Sequence[str],
+) -> c.Sequence["Language"]:
     """Prepare language(s) for benchmarking.
 
     Args:
@@ -220,7 +130,7 @@ def prepare_languages(
     language_mapping = get_all_languages()
 
     # Create the list `languages_str` of language codes to use for models or datasets
-    languages_str:
+    languages_str: c.Sequence[str]
     if language_codes is None:
         languages_str = default_language_codes
     elif isinstance(language_codes, str):
@@ -237,74 +147,76 @@ def prepare_languages(
     return prepared_languages
 
 
-def
-    task: str |
-
-    dataset: str |
-) ->
-    """Prepare
+def prepare_dataset_configs(
+    task: "str | Task | c.Sequence[str | Task] | None",
+    languages: c.Sequence["Language"],
+    dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None",
+) -> c.Sequence["DatasetConfig"]:
+    """Prepare dataset config(s) for benchmarking.
 
     Args:
         task:
             The tasks to include for dataset. If None then datasets will not be
            filtered based on their task.
-
+        languages:
            The languages of the datasets in the benchmark.
        dataset:
            The datasets to include for task. If None then all datasets will be
-            included, limited by the `task` and `
+            included, limited by the `task` and `languages` parameters.
 
     Returns:
-        The prepared
+        The prepared dataset configs.
 
     Raises:
        InvalidBenchmark:
            If the task or dataset is not found in the benchmark tasks or datasets.
    """
-    # Create
-    # task objects, and a dictionary that maps dataset names to their associated
-    # dataset configuration objects
-    task_mapping = get_all_tasks()
+    # Create the list of dataset configs
     all_dataset_configs = get_all_dataset_configs()
+    all_official_dataset_configs: c.Sequence[DatasetConfig] = [
+        dataset_config
+        for dataset_config in all_dataset_configs.values()
+        if not dataset_config.unofficial
+    ]
+    try:
+        if dataset is None:
+            datasets = all_official_dataset_configs
+        elif isinstance(dataset, str):
+            datasets = [all_dataset_configs[dataset]]
+        elif isinstance(dataset, DatasetConfig):
+            datasets = [dataset]
+        else:
+            datasets = [
+                all_dataset_configs[d] if isinstance(d, str) else d for d in dataset
+            ]
+    except KeyError as e:
+        raise InvalidBenchmark(
+            f"Dataset {e} not found in the benchmark datasets."
+        ) from e
 
     # Create the list of dataset tasks
+    task_mapping = {cfg.task.name: cfg.task for cfg in all_dataset_configs.values()}
     try:
         if task is None:
-            tasks =
+            tasks = None
         elif isinstance(task, str):
             tasks = [task_mapping[task]]
+        elif isinstance(task, Task):
+            tasks = [task]
         else:
-            tasks = [task_mapping[t] for t in task]
+            tasks = [task_mapping[t] if isinstance(t, str) else t for t in task]
     except KeyError as e:
         raise InvalidBenchmark(f"Task {e} not found in the benchmark tasks.") from e
 
-
-        dataset_name
-        for dataset_name, dataset_config in all_dataset_configs.items()
-        if not dataset_config.unofficial
-    ]
-    if dataset is None:
-        dataset = all_official_datasets
-    elif isinstance(dataset, str):
-        dataset = [dataset]
-
-    all_datasets = list(all_dataset_configs.keys())
-    invalid_datasets = set(dataset) - set(all_datasets)
-    if invalid_datasets:
-        raise InvalidBenchmark(
-            f"Dataset(s) {', '.join(invalid_datasets)} not found in the benchmark "
-            "datasets."
-        )
-
+    # Filter the dataset configs based on the specified tasks and languages
     datasets = [
-
-        for
-        if
-        and
-        and set(dataset_config.languages).intersection(dataset_languages)
+        ds
+        for ds in datasets
+        if (tasks is None or ds.task in tasks)
+        and any(lang in languages for lang in ds.languages)
     ]
 
-    return
+    return datasets
 
 
 def prepare_device(device: Device | None) -> torch.device:
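The net effect of the `benchmark_config_factory.py` changes is that dataset selection now works on `DatasetConfig` objects directly and filters them by task and by language overlap. Below is a small self-contained sketch of that filtering logic; the stand-in classes, dataset names, and plain-string languages are illustrative only, not the real `euroeval.data_models` types.

# Illustrative stand-ins for euroeval's Task/DatasetConfig; languages are plain
# strings here rather than Language objects, purely to keep the sketch short.
from dataclasses import dataclass


@dataclass(frozen=True)
class Task:
    name: str


@dataclass(frozen=True)
class DatasetConfig:
    name: str
    task: Task
    languages: tuple[str, ...]


def filter_dataset_configs(configs, tasks, languages):
    # Keep a config if no task filter is given or its task is requested, and it
    # covers at least one of the requested languages.
    return [
        cfg
        for cfg in configs
        if (tasks is None or cfg.task in tasks)
        and any(lang in languages for lang in cfg.languages)
    ]


configs = [
    DatasetConfig("dataset-a", Task("sentiment-classification"), ("da",)),
    DatasetConfig("dataset-b", Task("named-entity-recognition"), ("de",)),
]
print(filter_dataset_configs(configs, tasks=None, languages=["da"]))
# -> only the Danish config, since no task filter was given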
euroeval/benchmark_modules/base.py
CHANGED
@@ -2,24 +2,23 @@
 
 import collections.abc as c
 import logging
-import
+import re
 import typing as t
 from abc import ABC, abstractmethod
 from functools import cached_property, partial
 
-from datasets import DatasetDict
+from datasets import Dataset, DatasetDict
 from torch import nn
-from tqdm.auto import tqdm
 
 from ..enums import TaskGroup
-from ..exceptions import NeedsEnvironmentVariable, NeedsExtraInstalled
+from ..exceptions import InvalidBenchmark, NeedsEnvironmentVariable, NeedsExtraInstalled
+from ..logging_utils import get_pbar, log_once
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,
     token_classification,
 )
-from ..utils import log_once
 
 if t.TYPE_CHECKING:
     from transformers.tokenization_utils import PreTrainedTokenizer
@@ -35,8 +34,6 @@ if t.TYPE_CHECKING:
     from ..enums import BatchingPreference, GenerativeType
     from ..types import ComputeMetricsFunction, ExtractLabelsFunction
 
-logger = logging.getLogger("euroeval")
-
 
 class BenchmarkModule(ABC):
     """Abstract class for a benchmark module.
@@ -55,12 +52,14 @@ class BenchmarkModule(ABC):
     fresh_model: bool
     batching_preference: "BatchingPreference"
     high_priority: bool
+    allowed_params: dict[re.Pattern, c.Sequence[str]] = {re.compile(r".*"): []}
 
     def __init__(
         self,
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the benchmark module.
 
@@ -71,29 +70,25 @@ class BenchmarkModule(ABC):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log the metadata of the model.
         """
         self.model_config = model_config
         self.dataset_config = dataset_config
         self.benchmark_config = benchmark_config
+        self.log_metadata = log_metadata
         self.buffer: dict[str, t.Any] = dict()
-        self.
+        if self.log_metadata:
+            self._log_metadata()
 
     def _log_metadata(self) -> None:
         """Log the metadata of the model."""
-
-
-            logging_level = logging.CRITICAL
-        elif self.benchmark_config.verbose:
-            logging_level = logging.DEBUG
-        else:
-            logging_level = logging.INFO
-        logger.setLevel(logging_level)
-
-        logging_msg: str = ""
+        model_id = self.model_config.model_id
+        logging_msg: str = " ↳ "
         if self.num_params < 0:
-            logging_msg += "The model has an unknown number of parameters, "
+            logging_msg += f"The model {model_id} has an unknown number of parameters, "
         else:
-            logging_msg += f"The model has {self.num_params:,} parameters, "
+            logging_msg += f"The model {model_id} has {self.num_params:,} parameters, "
         if self.vocab_size < 0:
             logging_msg += "an unknown vocabulary size, "
         else:
@@ -117,16 +112,16 @@ class BenchmarkModule(ABC):
             f"{self.__class__.__name__}."
         )
 
-    def
-        """Get the underlying
+    def get_tokeniser(self) -> "PreTrainedTokenizer":
+        """Get the underlying tokeniser.
 
         Returns:
-            The
+            The tokeniser.
         """
-        if hasattr(self, "
-            return self.
+        if hasattr(self, "_tokeniser"):
+            return self._tokeniser
         raise NotImplementedError(
-            "The `
+            "The `get_tokeniser` method has not been implemented for "
             f"{self.__class__.__name__}."
         )
 
@@ -172,7 +167,7 @@ class BenchmarkModule(ABC):
 
     @property
     @abstractmethod
-    def data_collator(self) -> c.Callable[[
+    def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.
 
         Returns:
@@ -192,11 +187,13 @@ class BenchmarkModule(ABC):
                 return partial(
                     sequence_classification.compute_metrics,
                     dataset_config=self.dataset_config,
+                    benchmark_config=self.benchmark_config,
                 )
             case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
                 return partial(
                     sequence_classification.compute_metrics,
                     dataset_config=self.dataset_config,
+                    benchmark_config=self.benchmark_config,
                 )
             case TaskGroup.TEXT_TO_TEXT:
                 return partial(
@@ -209,11 +206,13 @@ class BenchmarkModule(ABC):
                     token_classification.compute_metrics,
                     has_misc_tags=self.buffer.get("has_misc_tags", True),
                     dataset_config=self.dataset_config,
+                    benchmark_config=self.benchmark_config,
                 )
             case TaskGroup.QUESTION_ANSWERING:
                 return partial(
                     question_answering.compute_metrics,
                     dataset_config=self.dataset_config,
+                    benchmark_config=self.benchmark_config,
                 )
             case _:
                 raise NotImplementedError(
@@ -242,7 +241,7 @@ class BenchmarkModule(ABC):
 
     def prepare_datasets(
         self, datasets: list[DatasetDict], task: "Task"
-    ) ->
+    ) -> c.Sequence[DatasetDict]:
         """Prepare the datasets for the model.
 
         This includes things like tokenisation.
@@ -255,30 +254,41 @@ class BenchmarkModule(ABC):
 
         Returns:
             The prepared datasets.
+
+        Raises:
+            InvalidBenchmark:
+                If the dataset does not have a 'train' split for token classification
+                tasks.
         """
         for idx, dataset in enumerate(
-
+            get_pbar(
+                iterable=datasets,
+                desc="Preparing datasets",
+                disable=not self.benchmark_config.progress_bar,
+            )
         ):
             prepared_dataset = self.prepare_dataset(
                 dataset=dataset, task=task, itr_idx=idx
             )
             if self.dataset_config.task.task_group == TaskGroup.TOKEN_CLASSIFICATION:
+                if "train" not in dataset:
+                    raise InvalidBenchmark(
+                        "The dataset does not have a 'train' split, which is required "
+                        "for token classification tasks."
+                    )
                 labels_in_train: set[str] = {
                     tag for tag_list in dataset["train"]["labels"] for tag in tag_list
                 }
                 self.buffer["has_misc_tags"] = (
                     "B-MISC" in labels_in_train or "I-MISC" in labels_in_train
                 )
-
-
-
-
-
-
-
-                original_test=dataset["test"],
-            )
-        )
+
+            datasets_dict: dict[str, Dataset] = dict()
+            for split_name, split in prepared_dataset.items():
+                datasets_dict[split_name] = split
+            for split_name, split in dataset.items():
+                datasets_dict[f"original_{split_name}"] = split
+            datasets[idx] = DatasetDict(datasets_dict)
         return datasets
 
     @abstractmethod