ScandEval 16.12.0-py3-none-any.whl → 16.13.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scandeval/async_utils.py +46 -0
- scandeval/benchmark_config_factory.py +26 -2
- scandeval/benchmark_modules/fresh.py +2 -1
- scandeval/benchmark_modules/hf.py +50 -12
- scandeval/benchmark_modules/litellm.py +25 -15
- scandeval/benchmark_modules/vllm.py +3 -3
- scandeval/benchmarker.py +15 -33
- scandeval/cli.py +2 -4
- scandeval/constants.py +5 -0
- scandeval/custom_dataset_configs.py +152 -0
- scandeval/data_loading.py +87 -31
- scandeval/data_models.py +396 -225
- scandeval/dataset_configs/__init__.py +51 -25
- scandeval/dataset_configs/albanian.py +1 -1
- scandeval/dataset_configs/belarusian.py +47 -0
- scandeval/dataset_configs/bulgarian.py +1 -1
- scandeval/dataset_configs/catalan.py +1 -1
- scandeval/dataset_configs/croatian.py +1 -1
- scandeval/dataset_configs/danish.py +3 -2
- scandeval/dataset_configs/dutch.py +7 -6
- scandeval/dataset_configs/english.py +4 -3
- scandeval/dataset_configs/estonian.py +8 -7
- scandeval/dataset_configs/faroese.py +1 -1
- scandeval/dataset_configs/finnish.py +5 -4
- scandeval/dataset_configs/french.py +6 -5
- scandeval/dataset_configs/german.py +4 -3
- scandeval/dataset_configs/greek.py +1 -1
- scandeval/dataset_configs/hungarian.py +1 -1
- scandeval/dataset_configs/icelandic.py +4 -3
- scandeval/dataset_configs/italian.py +4 -3
- scandeval/dataset_configs/latvian.py +2 -2
- scandeval/dataset_configs/lithuanian.py +1 -1
- scandeval/dataset_configs/norwegian.py +6 -5
- scandeval/dataset_configs/polish.py +4 -3
- scandeval/dataset_configs/portuguese.py +5 -4
- scandeval/dataset_configs/romanian.py +2 -2
- scandeval/dataset_configs/serbian.py +1 -1
- scandeval/dataset_configs/slovene.py +1 -1
- scandeval/dataset_configs/spanish.py +4 -3
- scandeval/dataset_configs/swedish.py +4 -3
- scandeval/dataset_configs/ukrainian.py +1 -1
- scandeval/generation_utils.py +6 -6
- scandeval/metrics/llm_as_a_judge.py +1 -1
- scandeval/metrics/pipeline.py +1 -1
- scandeval/model_cache.py +34 -4
- scandeval/prompt_templates/linguistic_acceptability.py +9 -0
- scandeval/prompt_templates/multiple_choice.py +9 -0
- scandeval/prompt_templates/named_entity_recognition.py +21 -0
- scandeval/prompt_templates/reading_comprehension.py +10 -0
- scandeval/prompt_templates/sentiment_classification.py +11 -0
- scandeval/string_utils.py +157 -0
- scandeval/task_group_utils/sequence_classification.py +2 -5
- scandeval/task_group_utils/token_classification.py +2 -4
- scandeval/utils.py +6 -323
- scandeval-16.13.0.dist-info/METADATA +334 -0
- scandeval-16.13.0.dist-info/RECORD +94 -0
- scandeval-16.12.0.dist-info/METADATA +0 -667
- scandeval-16.12.0.dist-info/RECORD +0 -90
- {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
- {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
- {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/async_utils.py
ADDED
@@ -0,0 +1,46 @@
+"""Utility functions for asyncronous tasks."""
+
+import asyncio
+import typing as t
+
+from .constants import T
+
+
+def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
+    """Run a coroutine, ensuring that the event loop is always closed when we're done.
+
+    Args:
+        coroutine:
+            The coroutine to run.
+
+    Returns:
+        The result of the coroutine.
+    """
+    try:
+        loop = asyncio.get_event_loop()
+    except RuntimeError:  # If the current event loop is closed
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+    response = loop.run_until_complete(coroutine)
+    return response
+
+
+async def add_semaphore_and_catch_exception(
+    coroutine: t.Coroutine[t.Any, t.Any, T], semaphore: asyncio.Semaphore
+) -> T | Exception:
+    """Run a coroutine with a semaphore.
+
+    Args:
+        coroutine:
+            The coroutine to run.
+        semaphore:
+            The semaphore to use.
+
+    Returns:
+        The result of the coroutine.
+    """
+    async with semaphore:
+        try:
+            return await coroutine
+        except Exception as exc:
+            return exc
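For orientation, below is a minimal usage sketch of the two new helpers (not part of the diff). The import path follows the new module's location; the dummy coroutine and the concurrency limit of four are illustrative assumptions.

import asyncio

from scandeval.async_utils import add_semaphore_and_catch_exception, safe_run


async def fetch(i: int) -> int:
    """Stand-in for a remote call; purely illustrative."""
    await asyncio.sleep(0.01)
    return i * 2


async def run_all() -> list[int | Exception]:
    # At most four coroutines run concurrently; failures are returned, not raised.
    semaphore = asyncio.Semaphore(4)
    tasks = [add_semaphore_and_catch_exception(fetch(i), semaphore) for i in range(10)]
    return await asyncio.gather(*tasks)


# safe_run creates and sets a fresh event loop if the current one is closed.
results = safe_run(run_all())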

scandeval/benchmark_config_factory.py
CHANGED
@@ -46,6 +46,8 @@ def build_benchmark_config(
         dataset=benchmark_config_params.dataset,
         languages=languages,
         custom_datasets_file=benchmark_config_params.custom_datasets_file,
+        api_key=benchmark_config_params.api_key,
+        cache_dir=Path(benchmark_config_params.cache_dir),
     )
 
     return BenchmarkConfig(
@@ -159,7 +161,9 @@ def prepare_dataset_configs(
     languages: c.Sequence["Language"],
     dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None",
     custom_datasets_file: Path,
-
+    api_key: str | None,
+    cache_dir: Path,
+) -> list["DatasetConfig"]:
     """Prepare dataset config(s) for benchmarking.
 
     Args:
@@ -173,6 +177,10 @@ def prepare_dataset_configs(
             included, limited by the `task` and `languages` parameters.
         custom_datasets_file:
            A path to a Python file containing custom dataset configurations.
+        api_key:
+            The API key to use for accessing the Hugging Face Hub.
+        cache_dir:
+            The directory to store the cache in.
 
     Returns:
         The prepared dataset configs.
@@ -181,9 +189,25 @@ def prepare_dataset_configs(
         InvalidBenchmark:
             If the task or dataset is not found in the benchmark tasks or datasets.
     """
+    # Extract the dataset IDs from the `dataset` argument
+    dataset_ids: list[str] = list()
+    if isinstance(dataset, str):
+        dataset_ids.append(dataset)
+    elif isinstance(dataset, DatasetConfig):
+        dataset_ids.append(dataset.name)
+    elif isinstance(dataset, list):
+        for d in dataset:
+            if isinstance(d, str):
+                dataset_ids.append(d)
+            elif isinstance(d, DatasetConfig):
+                dataset_ids.append(d.name)
+
     # Create the list of dataset configs
     all_dataset_configs = get_all_dataset_configs(
-        custom_datasets_file=custom_datasets_file
+        custom_datasets_file=custom_datasets_file,
+        dataset_ids=dataset_ids,
+        api_key=api_key,
+        cache_dir=cache_dir,
     )
     all_official_dataset_configs: c.Sequence[DatasetConfig] = [
         dataset_config

scandeval/benchmark_modules/fresh.py
CHANGED
@@ -28,8 +28,9 @@ from ..exceptions import (
 )
 from ..generation_utils import raise_if_wrong_params
 from ..logging_utils import block_terminal_output
+from ..model_cache import create_model_cache_dir
 from ..types import Tokeniser
-from ..utils import
+from ..utils import get_hf_token
 from .hf import (
     HuggingFaceEncoderModel,
     align_model_and_tokeniser,

scandeval/benchmark_modules/hf.py
CHANGED
@@ -1,6 +1,7 @@
 """Encoder models from the Hugging Face Hub."""
 
 import collections.abc as c
+import importlib
 import logging
 import re
 import typing as t
@@ -63,6 +64,8 @@ from ..exceptions import (
 from ..generation_utils import raise_if_wrong_params
 from ..languages import get_all_languages
 from ..logging_utils import block_terminal_output, log, log_once
+from ..model_cache import create_model_cache_dir
+from ..string_utils import split_model_id
 from ..task_group_utils import (
     multiple_choice_classification,
     question_answering,
@@ -70,13 +73,7 @@ from ..task_group_utils import (
 )
 from ..tokenisation_utils import get_bos_token, get_eos_token
 from ..types import Tokeniser
-from ..utils import
-    create_model_cache_dir,
-    get_class_by_name,
-    get_hf_token,
-    internet_connection_available,
-    split_model_id,
-)
+from ..utils import get_hf_token, internet_connection_available
 from .base import BenchmarkModule
 
 try:
@@ -381,7 +378,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         if "label" in examples:
             try:
                 examples["label"] = [
-                    self._model.config.label2id[lbl.lower()]
+                    self._model.config.label2id[str(lbl).lower()]
                     if self._model.config.label2id is not None
                     else lbl
                     for lbl in examples["label"]
@@ -817,8 +814,8 @@ def get_model_repo_info(
         log(
             f"Could not access the model {model_id} with the revision "
             f"{revision}. The error was {str(e)!r}. Please set the "
-            "`HUGGINGFACE_API_KEY` environment variable or
-            "`--api-key` argument.",
+            "`HUGGINGFACE_API_KEY` or `HF_TOKEN` environment variable or "
+            "use the `--api-key` argument.",
             level=logging.DEBUG,
         )
         return None
@@ -1095,8 +1092,8 @@ def load_hf_model_config(
             f"The model {model_id!r} is a gated repository. Please ensure "
             "that you are logged in with `hf auth login` or have provided a "
             "valid Hugging Face access token with the `HUGGINGFACE_API_KEY` "
-            "environment variable or the `--api-key` argument.
-            "your account has access to this model."
+            "or `HF_TOKEN` environment variable or the `--api-key` argument. "
+            "Also check that your account has access to this model."
         ) from e
         raise InvalidModel(
             f"Couldn't load model config for {model_id!r}. The error was "
@@ -1334,3 +1331,44 @@ def task_group_to_class_name(task_group: TaskGroup) -> str:
     )
     pascal_case = special_case_mapping.get(pascal_case, pascal_case)
     return f"AutoModelFor{pascal_case}"
+
+
+def get_class_by_name(
+    class_name: str | c.Sequence[str], module_name: str
+) -> t.Type | None:
+    """Get a class by its name.
+
+    Args:
+        class_name:
+            The name of the class, written in kebab-case. The corresponding class name
+            must be the same, but written in PascalCase, and lying in a module with the
+            same name, but written in snake_case. If a list of strings is passed, the
+            first class that is found is returned.
+        module_name:
+            The name of the module where the class is located.
+
+    Returns:
+        The class. If the class is not found, None is returned.
+    """
+    if isinstance(class_name, str):
+        class_name = [class_name]
+
+    error_messages = list()
+    for name in class_name:
+        try:
+            module = importlib.import_module(name=module_name)
+            class_: t.Type = getattr(module, name)
+            return class_
+        except (ModuleNotFoundError, AttributeError) as e:
+            error_messages.append(str(e))
+
+    if error_messages:
+        errors = "\n- " + "\n- ".join(error_messages)
+        log(
+            f"Could not find the class with the name(s) {', '.join(class_name)}. The "
+            f"following error messages were raised: {errors}",
+            level=logging.DEBUG,
+        )
+
+    # If the class could not be found, return None
+    return None
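As a rough illustration (not part of the diff), the relocated `get_class_by_name` helper returns the first name that resolves in the given module and None otherwise. The import path below is an assumption for the sketch.

# Illustrative only: assumes the helper is importable from this module path.
from scandeval.benchmark_modules.hf import get_class_by_name

# "NoSuchClass" raises AttributeError and is recorded; the second name resolves
# in `transformers`, so that class is returned and nothing is logged.
model_class = get_class_by_name(
    class_name=["NoSuchClass", "AutoModelForSequenceClassification"],
    module_name="transformers",
)
print(model_class)  # <class 'transformers...AutoModelForSequenceClassification'>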

scandeval/benchmark_modules/litellm.py
CHANGED
@@ -40,7 +40,7 @@ from pydantic import ValidationError, conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
 
-from ..
+from ..async_utils import add_semaphore_and_catch_exception, safe_run
 from ..constants import (
     JSON_STRIP_CHARACTERS,
     LITELLM_CLASSIFICATION_OUTPUT_KEY,
@@ -74,6 +74,8 @@ from ..generation_utils import (
     raise_if_wrong_params,
 )
 from ..logging_utils import get_pbar, log, log_once
+from ..model_cache import create_model_cache_dir
+from ..string_utils import split_model_id
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -83,13 +85,7 @@ from ..task_group_utils import (
 from ..tasks import NER
 from ..tokenisation_utils import get_first_label_token_mapping
 from ..types import ExtractLabelsFunction
-from ..utils import
-    add_semaphore_and_catch_exception,
-    create_model_cache_dir,
-    get_hf_token,
-    safe_run,
-    split_model_id,
-)
+from ..utils import get_hf_token
 from .base import BenchmarkModule
 from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokeniser
 
@@ -700,10 +696,10 @@ class LiteLLMModel(BenchmarkModule):
         elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
-
-
-                "Retrying in 10 seconds...",
-                level=logging.
+            log(
+                "Service temporarily unavailable during generation. The error "
+                f"message was: {error}. Retrying in 10 seconds...",
+                level=logging.INFO,
             )
             return generation_kwargs, 10
         elif isinstance(error, UnsupportedParamsError):
@@ -764,6 +760,20 @@ class LiteLLMModel(BenchmarkModule):
                 run_with_cli=self.benchmark_config.run_with_cli,
             ) from error
 
+        if (
+            isinstance(error, (BadRequestError, NotFoundError))
+            and self.benchmark_config.api_base is not None
+            and not self.benchmark_config.api_base.endswith("/v1")
+        ):
+            log_once(
+                f"The API base {self.benchmark_config.api_base!r} is not valid. We "
+                "will try appending '/v1' to it and try again.",
+                level=logging.DEBUG,
+            )
+            self.benchmark_config.api_base += "/v1"
+            generation_kwargs["api_base"] = self.benchmark_config.api_base
+            return generation_kwargs, 0
+
         raise InvalidBenchmark(
             f"Failed to generate text. The error message was: {error}"
         ) from error
@@ -1390,9 +1400,10 @@ class LiteLLMModel(BenchmarkModule):
             InternalServerError,
         ) as e:
             log(
-
+                "Service temporarily unavailable while checking for model "
+                f"existence of the model {model_id!r}. The error message was: {e}. "
                 "Retrying in 10 seconds...",
-                level=logging.
+                level=logging.INFO,
             )
             sleep(10)
         except APIError as e:
@@ -1567,7 +1578,6 @@ class LiteLLMModel(BenchmarkModule):
 
         return dataset
 
-    @cache_arguments()
     def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
         """Get the generation arguments for the model.
 

scandeval/benchmark_modules/vllm.py
CHANGED
@@ -54,6 +54,8 @@ from ..generation_utils import (
 )
 from ..languages import get_all_languages
 from ..logging_utils import get_pbar, log, log_once, no_terminal_output
+from ..model_cache import create_model_cache_dir
+from ..string_utils import split_model_id
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -73,12 +75,10 @@ from ..tokenisation_utils import (
 from ..types import ExtractLabelsFunction, Tokeniser
 from ..utils import (
     clear_memory,
-    create_model_cache_dir,
     get_hf_token,
     get_min_cuda_compute_capability,
     internet_connection_available,
     resolve_model_path,
-    split_model_id,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
 
@@ -1144,7 +1144,7 @@ def load_model_and_tokeniser(
         pipeline_parallel_size=pipeline_parallel_size,
         disable_custom_all_reduce=True,
         quantization=quantization,
-        dtype=dtype,
+        dtype=dtype,  # pyrefly: ignore[bad-argument-type]
         enforce_eager=True,
         # TEMP: Prefix caching isn't supported with sliding window in vLLM yet,
         # so we disable it for now
scandeval/benchmarker.py
CHANGED
@@ -18,7 +18,6 @@ from .benchmark_config_factory import build_benchmark_config
 from .constants import ATTENTION_BACKENDS, GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data, load_raw_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
-from .dataset_configs import get_all_dataset_configs
 from .enums import Device, GenerativeType, ModelType
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
@@ -28,12 +27,9 @@ from .model_config import get_model_config
 from .model_loading import load_model
 from .scores import log_scores
 from .speed_benchmark import benchmark_speed
+from .string_utils import split_model_id
 from .tasks import SPEED
-from .utils import
-    enforce_reproducibility,
-    internet_connection_available,
-    split_model_id,
-)
+from .utils import enforce_reproducibility, internet_connection_available
 
 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
@@ -79,7 +75,9 @@ class Benchmarker:
         api_base: str | None = None,
         api_version: str | None = None,
         gpu_memory_utilization: float = 0.8,
-        attention_backend:
+        attention_backend: t.Literal[
+            *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+        ] = "FLASHINFER",
         generative_type: GenerativeType | None = None,
         custom_datasets_file: Path | str = Path("custom_datasets.py"),
         debug: bool = False,
@@ -346,7 +344,9 @@ class Benchmarker:
             f"Loading data for {dataset_config.logging_string}", level=logging.INFO
         )
         dataset = load_raw_data(
-            dataset_config=dataset_config,
+            dataset_config=dataset_config,
+            cache_dir=benchmark_config.cache_dir,
+            api_key=benchmark_config.api_key,
        )
         del dataset
 
@@ -513,6 +513,11 @@ class Benchmarker:
            ValueError:
                If both `task` and `dataset` are specified.
        """
+        log(
+            "Started EuroEval run. Run with `--verbose` for more information.",
+            level=logging.INFO,
+        )
+
         if task is not None and dataset is not None:
             raise ValueError("Only one of `task` and `dataset` can be specified.")
 
@@ -790,7 +795,7 @@ class Benchmarker:
 
         # Update the benchmark config if the dataset requires it
         if (
-
+            dataset_config.val_split is None
             and not benchmark_config.evaluate_test_split
         ):
             log(
@@ -1066,7 +1071,7 @@ class Benchmarker:
             ),
             validation_split=(
                 None
-                if
+                if dataset_config.val_split is None
                 else not benchmark_config.evaluate_test_split
             ),
         )
@@ -1181,29 +1186,6 @@ def clear_model_cache_fn(cache_dir: str) -> None:
             rmtree(sub_model_dir)
 
 
-def prepare_dataset_configs(
-    dataset_names: c.Sequence[str], custom_datasets_file: Path
-) -> c.Sequence["DatasetConfig"]:
-    """Prepare the dataset configuration(s) to be benchmarked.
-
-    Args:
-        dataset_names:
-            The dataset names to benchmark.
-        custom_datasets_file:
-            A path to a Python file containing custom dataset configurations.
-
-    Returns:
-        The prepared list of model IDs.
-    """
-    return [
-        cfg
-        for cfg in get_all_dataset_configs(
-            custom_datasets_file=custom_datasets_file
-        ).values()
-        if cfg.name in dataset_names
-    ]
-
-
 def initial_logging(
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
scandeval/cli.py
CHANGED
@@ -5,6 +5,7 @@ from pathlib import Path
 import click
 
 from .benchmarker import Benchmarker
+from .constants import ATTENTION_BACKENDS
 from .data_models import DatasetConfig
 from .enums import Device, GenerativeType
 from .languages import get_all_languages
@@ -174,10 +175,7 @@ from .languages import get_all_languages
     "--attention-backend",
     default="FLASHINFER",
     show_default=True,
-    type=click.Choice(
-        ["FLASHINFER", "FLASH_ATTN", "TRITON_ATTN", "FLEX_ATTENTION"],
-        case_sensitive=True,
-    ),
+    type=click.Choice(ATTENTION_BACKENDS, case_sensitive=True),
     help="The attention backend to use for vLLM. Only relevant if the model is "
     "generative.",
 )
scandeval/constants.py
CHANGED
@@ -134,3 +134,8 @@ ATTENTION_BACKENDS: list[str] = [
     "CPU_ATTN",
     "CUSTOM",
 ]
+
+# If a dataset configuration has more than this number of languages, we won't log any of
+# the languages. This is for instance the case for the speed benchmark, which has all
+# the languages. The threshold of 5 is somewhat arbitrary.
+MAX_NUMBER_OF_LOGGING_LANGUAGES = 5

scandeval/custom_dataset_configs.py
ADDED
@@ -0,0 +1,152 @@
+"""Load custom dataset configs."""
+
+import importlib.util
+import logging
+from pathlib import Path
+from types import ModuleType
+
+from huggingface_hub import HfApi
+
+from .data_models import DatasetConfig
+from .logging_utils import log_once
+from .utils import get_hf_token
+
+
+def load_custom_datasets_module(custom_datasets_file: Path) -> ModuleType | None:
+    """Load the custom datasets module if it exists.
+
+    Args:
+        custom_datasets_file:
+            The path to the custom datasets module.
+
+    Raises:
+        RuntimeError:
+            If the custom datasets module cannot be loaded.
+    """
+    if custom_datasets_file.exists():
+        spec = importlib.util.spec_from_file_location(
+            name="custom_datasets_module", location=str(custom_datasets_file.resolve())
+        )
+        if spec is None:
+            log_once(
+                "Could not load the spec for the custom datasets file from "
+                f"{custom_datasets_file.resolve()}.",
+                level=logging.ERROR,
+            )
+            return None
+        module = importlib.util.module_from_spec(spec=spec)
+        if spec.loader is None:
+            log_once(
+                "Could not load the module for the custom datasets file from "
+                f"{custom_datasets_file.resolve()}.",
+                level=logging.ERROR,
+            )
+            return None
+        spec.loader.exec_module(module)
+        return module
+    return None
+
+
+def try_get_dataset_config_from_repo(
+    dataset_id: str, api_key: str | None, cache_dir: Path
+) -> DatasetConfig | None:
+    """Try to get a dataset config from a Hugging Face dataset repository.
+
+    Args:
+        dataset_id:
+            The ID of the dataset to get the config for.
+        api_key:
+            The Hugging Face API key to use to check if the repositories have custom
+            dataset configs.
+        cache_dir:
+            The directory to store the cache in.
+
+    Returns:
+        The dataset config if it exists, otherwise None.
+    """
+    # Check if the dataset ID is a Hugging Face dataset ID, abort if it isn't
+    token = get_hf_token(api_key=api_key)
+    hf_api = HfApi(token=token)
+    if not hf_api.repo_exists(repo_id=dataset_id, repo_type="dataset"):
+        return None
+
+    # Check if the repository has a euroeval_config.py file, abort if it doesn't
+    repo_files = hf_api.list_repo_files(
+        repo_id=dataset_id, repo_type="dataset", revision="main"
+    )
+    if "euroeval_config.py" not in repo_files:
+        log_once(
+            f"Dataset {dataset_id} does not have a euroeval_config.py file, so we "
+            "cannot load it. Skipping.",
+            level=logging.WARNING,
+        )
+        return None
+
+    # Fetch the euroeval_config.py file, abort if loading failed
+    external_config_path = cache_dir / "external_dataset_configs" / dataset_id
+    external_config_path.mkdir(parents=True, exist_ok=True)
+    hf_api.hf_hub_download(
+        repo_id=dataset_id,
+        repo_type="dataset",
+        filename="euroeval_config.py",
+        local_dir=external_config_path,
+        local_dir_use_symlinks=False,
+    )
+    module = load_custom_datasets_module(
+        custom_datasets_file=external_config_path / "euroeval_config.py"
+    )
+    if module is None:
+        return None
+
+    # Check that there is exactly one dataset config, abort if there isn't
+    repo_dataset_configs = [
+        cfg for cfg in vars(module).values() if isinstance(cfg, DatasetConfig)
+    ]
+    if not repo_dataset_configs:
+        return None  # Already warned the user in this case, so we just skip
+    elif len(repo_dataset_configs) > 1:
+        log_once(
+            f"Dataset {dataset_id} has multiple dataset configurations. Please ensure "
+            "that only a single DatasetConfig is defined in the `euroeval_config.py` "
+            "file.",
+            level=logging.WARNING,
+        )
+        return None
+
+    # Get the dataset split names
+    splits = [
+        split["name"]
+        for split in hf_api.dataset_info(repo_id=dataset_id).card_data.dataset_info[
+            "splits"
+        ]
+    ]
+    train_split_candidates = sorted(
+        [split for split in splits if "train" in split.lower()], key=len
+    )
+    val_split_candidates = sorted(
+        [split for split in splits if "val" in split.lower()], key=len
+    )
+    test_split_candidates = sorted(
+        [split for split in splits if "test" in split.lower()], key=len
+    )
+    train_split = train_split_candidates[0] if train_split_candidates else None
+    val_split = val_split_candidates[0] if val_split_candidates else None
+    test_split = test_split_candidates[0] if test_split_candidates else None
+    if test_split is None:
+        log_once(
+            f"Dataset {dataset_id} does not have a test split, so we cannot load it. "
+            "Please ensure that the dataset has a test split.",
+            level=logging.ERROR,
+        )
+        return None
+
+    # Set up the config with the repo information
+    repo_dataset_config = repo_dataset_configs[0]
+    repo_dataset_config.name = dataset_id
+    repo_dataset_config.pretty_name = dataset_id
+    repo_dataset_config.source = dataset_id
+    repo_dataset_config.train_split = train_split
+    repo_dataset_config.val_split = val_split
+    repo_dataset_config.test_split = test_split
+
+    return repo_dataset_config
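The split-selection heuristic above can be read in isolation: among the repository's split names, the shortest name containing "train", "val" or "test" wins, and the config is rejected when no test split exists. A standalone sketch with made-up split names:

splits = ["train", "train_extra", "validation", "test"]

train_candidates = sorted([s for s in splits if "train" in s.lower()], key=len)
val_candidates = sorted([s for s in splits if "val" in s.lower()], key=len)
test_candidates = sorted([s for s in splits if "test" in s.lower()], key=len)

train_split = train_candidates[0] if train_candidates else None  # "train"
val_split = val_candidates[0] if val_candidates else None  # "validation"
test_split = test_candidates[0] if test_candidates else None  # "test"
print(train_split, val_split, test_split)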