EuroEval 15.12.0-py3-none-any.whl → 16.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +32 -14
- euroeval/benchmark_config_factory.py +92 -180
- euroeval/benchmark_modules/base.py +49 -39
- euroeval/benchmark_modules/fresh.py +35 -21
- euroeval/benchmark_modules/hf.py +280 -244
- euroeval/benchmark_modules/litellm.py +752 -312
- euroeval/benchmark_modules/vllm.py +570 -268
- euroeval/benchmarker.py +651 -528
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +49 -38
- euroeval/constants.py +44 -25
- euroeval/data_loading.py +111 -55
- euroeval/data_models.py +490 -323
- euroeval/dataset_configs/__init__.py +26 -4
- euroeval/dataset_configs/bosnian.py +39 -0
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/croatian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +78 -50
- euroeval/dataset_configs/dutch.py +74 -44
- euroeval/dataset_configs/english.py +71 -36
- euroeval/dataset_configs/estonian.py +111 -0
- euroeval/dataset_configs/faroese.py +25 -18
- euroeval/dataset_configs/finnish.py +63 -26
- euroeval/dataset_configs/french.py +65 -32
- euroeval/dataset_configs/german.py +77 -36
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +68 -57
- euroeval/dataset_configs/italian.py +68 -36
- euroeval/dataset_configs/latvian.py +87 -0
- euroeval/dataset_configs/lithuanian.py +64 -0
- euroeval/dataset_configs/norwegian.py +98 -72
- euroeval/dataset_configs/polish.py +96 -0
- euroeval/dataset_configs/portuguese.py +63 -40
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/slovene.py +56 -0
- euroeval/dataset_configs/spanish.py +68 -34
- euroeval/dataset_configs/swedish.py +82 -41
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/enums.py +12 -6
- euroeval/exceptions.py +21 -1
- euroeval/finetuning.py +34 -26
- euroeval/generation.py +76 -41
- euroeval/generation_utils.py +169 -34
- euroeval/languages.py +1020 -188
- euroeval/logging_utils.py +268 -0
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +85 -0
- euroeval/metrics/huggingface.py +216 -0
- euroeval/metrics/llm_as_a_judge.py +260 -0
- euroeval/metrics/pipeline.py +289 -0
- euroeval/metrics/speed.py +48 -0
- euroeval/model_cache.py +40 -21
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +157 -22
- euroeval/prompt_templates/multiple_choice.py +159 -17
- euroeval/prompt_templates/named_entity_recognition.py +318 -21
- euroeval/prompt_templates/reading_comprehension.py +207 -16
- euroeval/prompt_templates/sentiment_classification.py +205 -22
- euroeval/prompt_templates/summarization.py +122 -22
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +20 -9
- euroeval/speed_benchmark.py +11 -12
- euroeval/task_group_utils/multiple_choice_classification.py +21 -12
- euroeval/task_group_utils/question_answering.py +101 -73
- euroeval/task_group_utils/sequence_classification.py +144 -61
- euroeval/task_group_utils/text_to_text.py +33 -12
- euroeval/task_group_utils/token_classification.py +86 -89
- euroeval/tasks.py +75 -16
- euroeval/tokenisation_utils.py +603 -0
- euroeval/types.py +17 -11
- euroeval/utils.py +332 -137
- euroeval-16.7.1.dist-info/METADATA +623 -0
- euroeval-16.7.1.dist-info/RECORD +84 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -737
- euroeval/metrics.py +0 -452
- euroeval/tokenization_utils.py +0 -498
- euroeval-15.12.0.dist-info/METADATA +0 -285
- euroeval-15.12.0.dist-info/RECORD +0 -63
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0

euroeval/caching_utils.py
ADDED

@@ -0,0 +1,79 @@
+"""Caching utility functions."""
+
+import typing as t
+from functools import wraps
+
+from .constants import T
+
+
+def cache_arguments(
+    *arguments: str, disable_condition: t.Callable[[], bool] = lambda: False
+) -> t.Callable[[t.Callable[..., T]], t.Callable[..., T]]:
+    """Cache specified arguments of a function.
+
+    Args:
+        arguments:
+            The list of argument names to cache. If empty, all arguments are cached.
+        disable_condition:
+            A function that checks if cache should be disabled.
+
+    Returns:
+        A decorator that caches the specified arguments of a function.
+    """
+
+    def caching_decorator(func: t.Callable[..., T]) -> t.Callable[..., T]:
+        """Decorator that caches the specified arguments of a function.
+
+        Args:
+            func:
+                The function to decorate.
+
+        Returns:
+            The decorated function.
+        """
+        cache: dict[tuple, T] = dict()
+
+        @wraps(func)
+        def wrapper(*args, **kwargs) -> T:
+            """Wrapper function that caches the specified arguments.
+
+            Args:
+                *args:
+                    The positional arguments to the function.
+                **kwargs:
+                    The keyword arguments to the function.
+
+            Returns:
+                The result of the function.
+
+            Raises:
+                ValueError:
+                    If an argument name is not found in the function parameters.
+            """
+            if not arguments:
+                key = args + tuple(kwargs[k] for k in sorted(kwargs.keys()))
+            else:
+                func_params = func.__code__.co_varnames
+                key_items: list[t.Any] = list()
+                for arg_name in arguments:
+                    if arg_name in kwargs:
+                        key_items.append(kwargs[arg_name])
+                    else:
+                        try:
+                            arg_index = func_params.index(arg_name)
+                            key_items.append(args[arg_index])
+                        except (ValueError, IndexError):
+                            raise ValueError(
+                                f"Argument {arg_name} not found in function "
+                                f"{func.__name__} parameters."
+                            )
+                key = tuple(key_items)
+
+            # Do not cache if the condition is met
+            if key not in cache or disable_condition():
+                cache[key] = func(*args, **kwargs)
+            return cache[key]
+
+        return wrapper
+
+    return caching_decorator
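The new `cache_arguments` decorator builds its cache key from the named arguments only (or from all arguments when none are named). A minimal usage sketch, with a hypothetical function and argument name that are not part of the package:

```python
from euroeval.caching_utils import cache_arguments


@cache_arguments("model_id")
def fetch_model_metadata(model_id: str, verbose: bool = False) -> dict:
    """Hypothetical expensive lookup, cached solely on `model_id`."""
    print(f"Fetching metadata for {model_id}...")  # only runs on a cache miss
    return {"model_id": model_id}


fetch_model_metadata("my-org/my-model")                # calls the function
fetch_model_metadata("my-org/my-model", verbose=True)  # returns the cached result
```

Passing `disable_condition` (for example a lambda checking a refresh flag) forces re-execution even on a hit, matching the `if key not in cache or disable_condition()` branch above.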
euroeval/callbacks.py
CHANGED

@@ -7,6 +7,8 @@ from collections.abc import Sized
 from tqdm.auto import tqdm
 from transformers.trainer_callback import ProgressCallback
 
+from .logging_utils import get_pbar
+
 if t.TYPE_CHECKING:
     from torch.utils.data import DataLoader
     from transformers.trainer_callback import TrainerControl, TrainerState
@@ -32,11 +34,8 @@ class NeverLeaveProgressCallback(ProgressCallback):
         """Callback actions when training begins."""
         if state.is_local_process_zero:
             desc = "Finetuning model"
-            self.training_bar =
-                total=None,
-                leave=False,
-                desc=desc,
-                disable=hasattr(sys, "_called_from_test"),
+            self.training_bar = get_pbar(
+                total=None, desc=desc, disable=hasattr(sys, "_called_from_test")
             )
             self.current_step = 0
 
@@ -67,9 +66,8 @@ class NeverLeaveProgressCallback(ProgressCallback):
         if state.is_local_process_zero and correct_dtype:
             if self.prediction_bar is None:
                 desc = "Evaluating model"
-                self.prediction_bar =
+                self.prediction_bar = get_pbar(
                     total=len(eval_dataloader),
-                    leave=False,
                     desc=desc,
                     disable=hasattr(sys, "_called_from_test"),
                 )
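The `get_pbar` helper comes from the new `euroeval/logging_utils.py`, which this diff does not show. Judging purely from the call sites above, a stand-in with compatible behaviour could look like the sketch below; this is an assumption, not the package's actual implementation:

```python
from tqdm.auto import tqdm


def get_pbar(*args, **kwargs) -> tqdm:
    """Hypothetical stand-in for euroeval.logging_utils.get_pbar: a tqdm factory
    with `leave=False` baked in, which would explain why the explicit
    `leave=False` arguments disappear from the call sites above."""
    kwargs.setdefault("leave", False)
    return tqdm(*args, **kwargs)
```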
euroeval/cli.py
CHANGED

@@ -3,10 +3,9 @@
 import click
 
 from .benchmarker import Benchmarker
-from .
-from .enums import Device
+from .data_models import DatasetConfig
+from .enums import Device, GenerativeType
 from .languages import get_all_languages
-from .tasks import get_all_tasks
 
 
 @click.command()
@@ -23,7 +22,6 @@ from .tasks import get_all_tasks
     default=None,
     show_default=True,
     multiple=True,
-    type=click.Choice(list(get_all_tasks().keys())),
     help="The dataset tasks to benchmark the model(s) on.",
 )
 @click.option(
@@ -45,8 +43,7 @@ from .tasks import get_all_tasks
     multiple=True,
     metavar="ISO 639-1 LANGUAGE CODE",
     type=click.Choice(["all"] + list(get_all_languages().keys())),
-    help="""
-    `language` value.""",
+    help="""This option is deprecated - please use --language instead.""",
 )
 @click.option(
     "--dataset-language",
@@ -56,24 +53,28 @@ from .tasks import get_all_tasks
     multiple=True,
     metavar="ISO 639-1 LANGUAGE CODE",
     type=click.Choice(["all"] + list(get_all_languages().keys())),
-    help="""
-    benchmarked on all datasets. If not specified then this will use the `language`
-    value.""",
+    help="""This option is deprecated - please use --language instead.""",
 )
 @click.option(
     "--dataset",
     default=None,
     show_default=True,
     multiple=True,
-    type=click.Choice(list(get_all_dataset_configs().keys())),
     help="""The name of the benchmark dataset. We recommend to use the `task` and
     `language` options instead of this option.""",
 )
 @click.option(
     "--batch-size",
+    default=None,
+    type=click.Choice(["1", "2", "4", "8", "16", "32"]),
+    help="This option is deprecated - please use --finetuning-batch-size instead.",
+    deprecated=True,
+)
+@click.option(
+    "--finetuning-batch-size",
     default="32",
     type=click.Choice(["1", "2", "4", "8", "16", "32"]),
-    help="The batch size to use.",
+    help="The batch size to use for finetuning.",
 )
 @click.option(
     "--progress-bar/--no-progress-bar",
@@ -188,7 +189,7 @@ from .tasks import get_all_tasks
 )
 @click.option(
     "--gpu-memory-utilization",
-    default=0.
+    default=0.8,
     show_default=True,
     help="The GPU memory utilization to use for vLLM. A larger value will result in "
     "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
@@ -203,20 +204,35 @@ from .tasks import get_all_tasks
     "relevant if the model is generative.",
 )
 @click.option(
-    "--
+    "--requires-safetensors",
     is_flag=True,
     help="Only allow loading models that have safetensors weights available",
     default=False,
 )
+@click.option(
+    "--generative-type",
+    type=click.Choice(["base", "instruction_tuned", "reasoning"]),
+    default=None,
+    show_default=True,
+    help="The type of generative model. Only relevant if the model is generative. If "
+    "not specified, the type will be inferred automatically.",
+)
+@click.option(
+    "--download-only",
+    is_flag=True,
+    help="Only download the requested model weights and datasets, and exit.",
+    default=False,
+)
 def benchmark(
     model: tuple[str],
-    dataset: tuple[str],
+    dataset: tuple[str | DatasetConfig],
     language: tuple[str],
     model_language: tuple[str],
     dataset_language: tuple[str],
     raise_errors: bool,
     task: tuple[str],
-    batch_size: str,
+    batch_size: str | None,
+    finetuning_batch_size: str,
     progress_bar: bool,
     save_results: bool,
     cache_dir: str,
@@ -233,25 +249,16 @@ def benchmark(
     api_version: str | None,
     gpu_memory_utilization: float,
     debug: bool,
-
+    requires_safetensors: bool,
+    generative_type: str | None,
+    download_only: bool,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
-
-
-
-
-
-    tasks = None if len(task) == 0 else list(task)
-    batch_size_int = int(batch_size)
-    device = Device[device.upper()] if device is not None else None
-
-    benchmarker = Benchmarker(
-        language=languages,
-        model_language=model_languages,
-        dataset_language=dataset_languages,
-        task=tasks,
-        dataset=datasets,
-        batch_size=batch_size_int,
+    Benchmarker(
+        language=list(language),
+        task=None if len(task) == 0 else list(task),
+        dataset=None if len(dataset) == 0 else list(dataset),
+        finetuning_batch_size=int(finetuning_batch_size),
         progress_bar=progress_bar,
         save_results=save_results,
         raise_errors=raise_errors,
@@ -259,7 +266,7 @@ def benchmark(
         api_key=api_key,
         force=force,
         cache_dir=cache_dir,
-        device=device,
+        device=Device[device.upper()] if device is not None else None,
         trust_remote_code=trust_remote_code,
         clear_model_cache=clear_model_cache,
         evaluate_test_split=evaluate_test_split,
@@ -268,13 +275,17 @@ def benchmark(
         api_base=api_base,
         api_version=api_version,
         gpu_memory_utilization=gpu_memory_utilization,
+        generative_type=GenerativeType[generative_type.upper()]
+        if generative_type
+        else None,
        debug=debug,
         run_with_cli=True,
-
-
-
-
-
+        requires_safetensors=requires_safetensors,
+        download_only=download_only,
+        model_language=None if len(model_language) == 0 else list(model_language),
+        dataset_language=None if len(dataset_language) == 0 else list(dataset_language),
+        batch_size=int(batch_size) if batch_size is not None else None,
+    ).benchmark(model=list(model))
 
 
 if __name__ == "__main__":
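The rewritten command body forwards the parsed options straight to `Benchmarker`. A hedged sketch of the equivalent programmatic call, using the new 16.7.1 keyword names that appear in the diff (the model ID and language are placeholders, and the sketch assumes the remaining `Benchmarker` arguments have defaults):

```python
from euroeval.benchmarker import Benchmarker
from euroeval.enums import GenerativeType

# Placeholder model ID and language; keyword names mirror the new CLI wiring above.
benchmarker = Benchmarker(
    language=["da"],
    finetuning_batch_size=32,
    requires_safetensors=True,
    generative_type=GenerativeType.REASONING,
    download_only=False,
)
benchmarker.benchmark(model=["my-org/my-model"])
```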
euroeval/constants.py
CHANGED

@@ -1,23 +1,25 @@
 """Constants used throughout the project."""
 
+import re
+from typing import TypeVar
+
 from .enums import TaskGroup
-
+
+# Type variable used for generic typing
+T = TypeVar("T", bound=object)
 
 # This is used as input to generative models; it cannot be a special token
 DUMMY_FILL_VALUE = 100
 
-
 # This is the maximum allowed context length for models for the purpose of this
 # benchmark. We will still report the models' true maximum context length in the
 # metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
 # all tokens in the context.
-MAX_CONTEXT_LENGTH =
-
+MAX_CONTEXT_LENGTH = 8_192
 
 # We need to raise the amount of tokens generated for reasoning models, to give them
 # time to think
-REASONING_MAX_TOKENS =
-
+REASONING_MAX_TOKENS = 8_192
 
 # The Hugging Face Hub pipeline tags used to classify models as generative
 GENERATIVE_PIPELINE_TAGS = [
@@ -28,48 +30,49 @@ GENERATIVE_PIPELINE_TAGS = [
     "video-text-to-text",
 ]
 
-
 # Used to disallow non-generative models to be evaluated on these task groups
 GENERATIVE_DATASET_TASK_GROUPS = [TaskGroup.TEXT_TO_TEXT]
 
-
 # Local models are required to have these files in their directory
 LOCAL_MODELS_REQUIRED_FILES = ["config.json"]
 
-
-# Tasks where we use structured generation for generative models
-TASKS_USING_JSON = [NER]
-
-
-# Tasks where we use log probabilities for generative models, rather than the raw
-# completion
-TASK_GROUPS_USING_LOGPROBS = [
-    TaskGroup.SEQUENCE_CLASSIFICATION,
-    TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
-]
-
-
 # The number of top log probabilities to return for generative models. For several APIs
 # this is the maximum number of log probabilities that can be returned
-
-
+MAX_VLLM_LOGPROBS = 20
+MAX_LITELLM_LOGPROBS = 8
 
 # We make sure to remove these metric attributes after each iteration, to avoid memory
 # leaks
 METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
 
-
 # Hugging Face Hub tags used to classify models as merge models
 MERGE_TAGS = ["merge", "mergekit"]
 
 # The minimum required CUDA compute capability for using bfloat16 in vLLM
 VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
 
+# The candidates for end-of-sequence, beginning-of-sequence and padding tokens
+EOS_TOKENS = ["</s>", "<|end_of_text|>", "<|endoftext|>", "[SEP]", "<|return|>"]
+BOS_TOKENS = ["<s>", "<|begin_of_text|>", "<|startoftext|>", "[CLS]"]
+PAD_TOKENS = [
+    "<pad>",
+    "<PAD>",
+    "[pad]",
+    "[PAD]",
+    "<|endoftext|>",
+    "<|end▁of▁sentence|>",
+    "<|im_end|>",
+]
+
 # Used to detect whether a model is a reasoning model
-REASONING_TOKENS = [
+REASONING_TOKENS: list[tuple[str | re.Pattern, str | re.Pattern]] = [
     ("<think>", "</think>"),
     ("<reason>", "</reason>"),
     ("<reasoning>", "</reasoning>"),
+    (
+        re.compile(pattern=r"<\|channel\|>(analysis|commentary)<\|message\|>"),
+        "<|channel|>final<|message|>",
+    ),
 ]
 
 # These tokens are sometimes used by models to indicate the end of a generated
@@ -77,3 +80,19 @@ REASONING_TOKENS = [
 # manually. We only use them as stop tokens if they actually appear in the model's
 # output
 CUSTOM_STOP_TOKENS = ["<sep>"]
+
+# For classification tasks we force LiteLLM models to output a JSON dictionary with a
+# single key and the values being restricted to the allowed labels. This is the key we
+# use
+LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
+
+# These characters are stripped from JSON output when trying to identify the label
+JSON_STRIP_CHARACTERS = ' {}\n\r":'
+
+# The number of tokens we generate when evaluating generative models on classification
+# tasks. We also use this to determine whether we should store logprobs in the model
+# outputs (and cache).
+NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
+
+# We only allow loading local datasets in these file formats
+SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS = ["csv"]
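The `REASONING_TOKENS` pairs now mix plain strings with compiled regexes, with the new entry covering `<|channel|>analysis/commentary<|message|>`-style markers. One way such (start, end) pairs can be applied, e.g. to cut a reasoning span out of a completion, is sketched below; this is an illustrative helper, not the package's own handling of reasoning output:

```python
import re

from euroeval.constants import REASONING_TOKENS


def strip_reasoning(text: str) -> str:
    """Illustrative helper (not part of the package): remove each reasoning span
    delimited by a (start, end) pair in REASONING_TOKENS, markers included."""
    for start, end in REASONING_TOKENS:
        # Entries can be plain strings or pre-compiled regex patterns
        start_pat = start.pattern if isinstance(start, re.Pattern) else re.escape(start)
        end_pat = end.pattern if isinstance(end, re.Pattern) else re.escape(end)
        text = re.sub(f"{start_pat}.*?{end_pat}", "", text, flags=re.DOTALL)
    return text.strip()


print(strip_reasoning("<think>Working it out...</think>The answer is 42."))
# Prints: The answer is 42.
```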
euroeval/data_loading.py
CHANGED

@@ -1,5 +1,6 @@
 """Functions related to the loading of the data."""
 
+import collections.abc as c
 import logging
 import sys
 import time
@@ -11,7 +12,10 @@ from datasets.exceptions import DatasetsError
 from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator
 
+from .constants import SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
+from .logging_utils import log, no_terminal_output
+from .tasks import EUROPEAN_VALUES
 from .utils import unscramble
 
 if t.TYPE_CHECKING:
@@ -19,8 +23,6 @@ if t.TYPE_CHECKING:
 
     from .data_models import BenchmarkConfig, DatasetConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def load_data(
     rng: Generator, dataset_config: "DatasetConfig", benchmark_config: "BenchmarkConfig"
@@ -48,40 +50,45 @@
         dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
     )
 
-    if not benchmark_config.evaluate_test_split:
+    if not benchmark_config.evaluate_test_split and "val" in dataset:
         dataset["test"] = dataset["val"]
 
     # Remove empty examples from the datasets
     for text_feature in ["tokens", "text"]:
-
-
+        for split in dataset_config.splits:
+            if text_feature in dataset[split].features:
+                dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)
 
-    # If we are testing then truncate the test set
-
+    # If we are testing then truncate the test set, unless we need the full set for
+    # evaluation
+    if hasattr(sys, "_called_from_test") and dataset_config.task != EUROPEAN_VALUES:
        dataset["test"] = dataset["test"].select(range(1))
 
-    # Bootstrap the splits
-
-
-
-
-
-
-
-
-
+    # Bootstrap the splits, if applicable
+    if dataset_config.bootstrap_samples:
+        bootstrapped_splits: dict[str, c.Sequence["Dataset"]] = dict()
+        for split in dataset_config.splits:
+            bootstrap_indices = rng.integers(
+                0,
+                len(dataset[split]),
+                size=(benchmark_config.num_iterations, len(dataset[split])),
+            )
+            bootstrapped_splits[split] = [
+                dataset[split].select(bootstrap_indices[idx])
+                for idx in range(benchmark_config.num_iterations)
+            ]
+        datasets = [
+            DatasetDict(
+                {
+                    split: bootstrapped_splits[split][idx]
+                    for split in dataset_config.splits
+                }
+            )
             for idx in range(benchmark_config.num_iterations)
         ]
+    else:
+        datasets = [dataset] * benchmark_config.num_iterations
 
-    datasets = [
-        DatasetDict(
-            {
-                split: bootstrapped_splits[split][idx]
-                for split in ["train", "val", "test"]
-            }
-        )
-        for idx in range(benchmark_config.num_iterations)
-    ]
     return datasets
 
 
@@ -97,40 +104,89 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
     Returns:
         The dataset.
     """
-
-
-
-
-
-
-
+    # Case where the dataset source is a Hugging Face ID
+    if isinstance(dataset_config.source, str):
+        num_attempts = 5
+        for _ in range(num_attempts):
+            try:
+                with no_terminal_output():
+                    dataset = load_dataset(
+                        path=dataset_config.source.split("::")[0],
+                        name=(
+                            dataset_config.source.split("::")[1]
+                            if "::" in dataset_config.source
+                            else None
+                        ),
+                        cache_dir=cache_dir,
+                        token=unscramble("XbjeOLhwebEaSaDUMqqaPaPIhgOcyOfDpGnX_"),
+                    )
+                break
+            except (
+                FileNotFoundError,
+                ConnectionError,
+                DatasetsError,
+                requests.ConnectionError,
+                requests.ReadTimeout,
+            ) as e:
+                log(
+                    f"Failed to load dataset {dataset_config.source!r}, due to "
+                    f"the following error: {e}. Retrying...",
+                    level=logging.DEBUG,
+                )
+                time.sleep(1)
+                continue
+            except HfHubHTTPError:
+                raise HuggingFaceHubDown()
+        else:
+            raise InvalidBenchmark(
+                f"Failed to load dataset {dataset_config.source!r} after "
+                f"{num_attempts} attempts. Run with verbose mode to see the individual "
+                "errors."
             )
-
-
-
-            ConnectionError,
-            DatasetsError,
-            requests.ConnectionError,
-            requests.ReadTimeout,
-        ):
-            logger.warning(
-                f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
-            )
-            time.sleep(1)
-            continue
-        except HfHubHTTPError:
-            raise HuggingFaceHubDown()
+
+    # Case where the dataset source is a dictionary with keys "train", "val" and "test",
+    # with the values pointing to local CSV files
     else:
-
-
-
-
+        data_files = {
+            split: dataset_config.source[split]
+            for split in dataset_config.splits
+            if split in dataset_config.source
+        }
+
+        # Get the file extension and ensure that all files have the same extension
+        file_extensions = {
+            split: dataset_config.source[split].split(".")[-1]
+            for split in dataset_config.splits
+            if split in dataset_config.source
+        }
+        if len(set(file_extensions.values())) != 1:
+            raise InvalidBenchmark(
+                "All data files in a custom dataset must have the same file extension. "
+                f"Got the extensions {', '.join(file_extensions.values())} for the "
+                f"dataset {dataset_config.name!r}."
+            )
+        file_extension = list(file_extensions.values())[0]
+
+        # Check that the file extension is supported
+        if file_extension not in SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS:
+            raise InvalidBenchmark(
+                "Unsupported file extension for custom dataset. Supported file "
+                "extensions are "
+                f"{', '.join(SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS)}, but got "
+                f"{file_extension!r}."
+            )
+
+        # Load the dataset
+        with no_terminal_output():
+            dataset = load_dataset(
+                path=file_extension, data_files=data_files, cache_dir=cache_dir
+            )
+
     assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
-
-    missing_keys = [key for key in required_keys if key not in dataset]
+    missing_keys = [key for key in dataset_config.splits if key not in dataset]
     if missing_keys:
         raise InvalidBenchmark(
             "The dataset is missing the following required splits: "
            f"{', '.join(missing_keys)}"
         )
-    return DatasetDict({key: dataset[key] for key in
+    return DatasetDict({key: dataset[key] for key in dataset_config.splits})