ScandEval 16.11.0__py3-none-any.whl → 16.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scandeval/__init__.py +0 -9
- scandeval/async_utils.py +46 -0
- scandeval/benchmark_config_factory.py +31 -2
- scandeval/benchmark_modules/fresh.py +2 -1
- scandeval/benchmark_modules/hf.py +76 -23
- scandeval/benchmark_modules/litellm.py +33 -15
- scandeval/benchmark_modules/vllm.py +97 -44
- scandeval/benchmarker.py +29 -33
- scandeval/cli.py +11 -0
- scandeval/constants.py +36 -2
- scandeval/custom_dataset_configs.py +152 -0
- scandeval/data_loading.py +87 -31
- scandeval/data_models.py +405 -224
- scandeval/dataset_configs/__init__.py +51 -25
- scandeval/dataset_configs/albanian.py +1 -1
- scandeval/dataset_configs/belarusian.py +47 -0
- scandeval/dataset_configs/bulgarian.py +1 -1
- scandeval/dataset_configs/catalan.py +1 -1
- scandeval/dataset_configs/croatian.py +1 -1
- scandeval/dataset_configs/danish.py +3 -2
- scandeval/dataset_configs/dutch.py +16 -5
- scandeval/dataset_configs/english.py +4 -3
- scandeval/dataset_configs/estonian.py +8 -7
- scandeval/dataset_configs/faroese.py +1 -1
- scandeval/dataset_configs/finnish.py +5 -4
- scandeval/dataset_configs/french.py +6 -5
- scandeval/dataset_configs/german.py +4 -3
- scandeval/dataset_configs/greek.py +1 -1
- scandeval/dataset_configs/hungarian.py +1 -1
- scandeval/dataset_configs/icelandic.py +4 -3
- scandeval/dataset_configs/italian.py +4 -3
- scandeval/dataset_configs/latvian.py +2 -2
- scandeval/dataset_configs/lithuanian.py +1 -1
- scandeval/dataset_configs/norwegian.py +6 -5
- scandeval/dataset_configs/polish.py +4 -3
- scandeval/dataset_configs/portuguese.py +5 -4
- scandeval/dataset_configs/romanian.py +2 -2
- scandeval/dataset_configs/serbian.py +1 -1
- scandeval/dataset_configs/slovene.py +1 -1
- scandeval/dataset_configs/spanish.py +4 -3
- scandeval/dataset_configs/swedish.py +4 -3
- scandeval/dataset_configs/ukrainian.py +1 -1
- scandeval/generation_utils.py +6 -6
- scandeval/metrics/__init__.py +1 -0
- scandeval/metrics/bias.py +237 -0
- scandeval/metrics/huggingface.py +2 -1
- scandeval/metrics/llm_as_a_judge.py +1 -1
- scandeval/metrics/pipeline.py +1 -1
- scandeval/model_cache.py +34 -4
- scandeval/prompt_templates/linguistic_acceptability.py +9 -0
- scandeval/prompt_templates/multiple_choice.py +9 -0
- scandeval/prompt_templates/named_entity_recognition.py +21 -0
- scandeval/prompt_templates/reading_comprehension.py +10 -0
- scandeval/prompt_templates/sentiment_classification.py +11 -0
- scandeval/string_utils.py +157 -0
- scandeval/task_group_utils/sequence_classification.py +2 -5
- scandeval/task_group_utils/token_classification.py +2 -4
- scandeval/tasks.py +22 -0
- scandeval/tokenisation_utils.py +12 -1
- scandeval/utils.py +13 -383
- scandeval-16.13.0.dist-info/METADATA +334 -0
- scandeval-16.13.0.dist-info/RECORD +94 -0
- scandeval-16.11.0.dist-info/METADATA +0 -649
- scandeval-16.11.0.dist-info/RECORD +0 -89
- {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
- {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
- {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/custom_dataset_configs.py
ADDED
@@ -0,0 +1,152 @@
+"""Load custom dataset configs."""
+
+import importlib.util
+import logging
+from pathlib import Path
+from types import ModuleType
+
+from huggingface_hub import HfApi
+
+from .data_models import DatasetConfig
+from .logging_utils import log_once
+from .utils import get_hf_token
+
+
+def load_custom_datasets_module(custom_datasets_file: Path) -> ModuleType | None:
+    """Load the custom datasets module if it exists.
+
+    Args:
+        custom_datasets_file:
+            The path to the custom datasets module.
+
+    Raises:
+        RuntimeError:
+            If the custom datasets module cannot be loaded.
+    """
+    if custom_datasets_file.exists():
+        spec = importlib.util.spec_from_file_location(
+            name="custom_datasets_module", location=str(custom_datasets_file.resolve())
+        )
+        if spec is None:
+            log_once(
+                "Could not load the spec for the custom datasets file from "
+                f"{custom_datasets_file.resolve()}.",
+                level=logging.ERROR,
+            )
+            return None
+        module = importlib.util.module_from_spec(spec=spec)
+        if spec.loader is None:
+            log_once(
+                "Could not load the module for the custom datasets file from "
+                f"{custom_datasets_file.resolve()}.",
+                level=logging.ERROR,
+            )
+            return None
+        spec.loader.exec_module(module)
+        return module
+    return None
+
+
+def try_get_dataset_config_from_repo(
+    dataset_id: str, api_key: str | None, cache_dir: Path
+) -> DatasetConfig | None:
+    """Try to get a dataset config from a Hugging Face dataset repository.
+
+    Args:
+        dataset_id:
+            The ID of the dataset to get the config for.
+        api_key:
+            The Hugging Face API key to use to check if the repositories have custom
+            dataset configs.
+        cache_dir:
+            The directory to store the cache in.
+
+    Returns:
+        The dataset config if it exists, otherwise None.
+    """
+    # Check if the dataset ID is a Hugging Face dataset ID, abort if it isn't
+    token = get_hf_token(api_key=api_key)
+    hf_api = HfApi(token=token)
+    if not hf_api.repo_exists(repo_id=dataset_id, repo_type="dataset"):
+        return None
+
+    # Check if the repository has a euroeval_config.py file, abort if it doesn't
+    repo_files = hf_api.list_repo_files(
+        repo_id=dataset_id, repo_type="dataset", revision="main"
+    )
+    if "euroeval_config.py" not in repo_files:
+        log_once(
+            f"Dataset {dataset_id} does not have a euroeval_config.py file, so we "
+            "cannot load it. Skipping.",
+            level=logging.WARNING,
+        )
+        return None
+
+    # Fetch the euroeval_config.py file, abort if loading failed
+    external_config_path = cache_dir / "external_dataset_configs" / dataset_id
+    external_config_path.mkdir(parents=True, exist_ok=True)
+    hf_api.hf_hub_download(
+        repo_id=dataset_id,
+        repo_type="dataset",
+        filename="euroeval_config.py",
+        local_dir=external_config_path,
+        local_dir_use_symlinks=False,
+    )
+    module = load_custom_datasets_module(
+        custom_datasets_file=external_config_path / "euroeval_config.py"
+    )
+    if module is None:
+        return None
+
+    # Check that there is exactly one dataset config, abort if there isn't
+    repo_dataset_configs = [
+        cfg for cfg in vars(module).values() if isinstance(cfg, DatasetConfig)
+    ]
+    if not repo_dataset_configs:
+        return None  # Already warned the user in this case, so we just skip
+    elif len(repo_dataset_configs) > 1:
+        log_once(
+            f"Dataset {dataset_id} has multiple dataset configurations. Please ensure "
+            "that only a single DatasetConfig is defined in the `euroeval_config.py` "
+            "file.",
+            level=logging.WARNING,
+        )
+        return None
+
+    # Get the dataset split names
+    splits = [
+        split["name"]
+        for split in hf_api.dataset_info(repo_id=dataset_id).card_data.dataset_info[
+            "splits"
+        ]
+    ]
+    train_split_candidates = sorted(
+        [split for split in splits if "train" in split.lower()], key=len
+    )
+    val_split_candidates = sorted(
+        [split for split in splits if "val" in split.lower()], key=len
+    )
+    test_split_candidates = sorted(
+        [split for split in splits if "test" in split.lower()], key=len
+    )
+    train_split = train_split_candidates[0] if train_split_candidates else None
+    val_split = val_split_candidates[0] if val_split_candidates else None
+    test_split = test_split_candidates[0] if test_split_candidates else None
+    if test_split is None:
+        log_once(
+            f"Dataset {dataset_id} does not have a test split, so we cannot load it. "
+            "Please ensure that the dataset has a test split.",
+            level=logging.ERROR,
+        )
+        return None
+
+    # Set up the config with the repo information
+    repo_dataset_config = repo_dataset_configs[0]
+    repo_dataset_config.name = dataset_id
+    repo_dataset_config.pretty_name = dataset_id
+    repo_dataset_config.source = dataset_id
+    repo_dataset_config.train_split = train_split
+    repo_dataset_config.val_split = val_split
+    repo_dataset_config.test_split = test_split
+
+    return repo_dataset_config
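A note on the new module above: try_get_dataset_config_from_repo requires the repository's euroeval_config.py to define exactly one DatasetConfig at module level, and it then picks split names purely lexically, taking the shortest split name containing "train", "val" or "test" and rejecting the config when no test split is found. The following standalone sketch, using made-up split names, reproduces that selection logic:

# Standalone sketch of the split-name selection in try_get_dataset_config_from_repo.
# The split names below are made up for illustration.
splits = ["train", "train_extra", "validation", "test"]

train_candidates = sorted([s for s in splits if "train" in s.lower()], key=len)
val_candidates = sorted([s for s in splits if "val" in s.lower()], key=len)
test_candidates = sorted([s for s in splits if "test" in s.lower()], key=len)

print(train_candidates[0] if train_candidates else None)  # train
print(val_candidates[0] if val_candidates else None)      # validation
print(test_candidates[0] if test_candidates else None)    # test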
scandeval/data_loading.py
CHANGED
@@ -9,14 +9,15 @@ import typing as t
 import requests
 from datasets import DatasetDict, load_dataset
 from datasets.exceptions import DatasetsError
-from huggingface_hub.errors import HfHubHTTPError
+from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError
 from numpy.random import Generator
 
 from .constants import SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
 from .logging_utils import log, no_terminal_output
+from .string_utils import unscramble
 from .tasks import EUROPEAN_VALUES
-from .utils import
+from .utils import get_hf_token
 
 if t.TYPE_CHECKING:
     from datasets import Dataset
@@ -47,15 +48,30 @@ def load_data(
             If the Hugging Face Hub is down.
     """
     dataset = load_raw_data(
-        dataset_config=dataset_config,
+        dataset_config=dataset_config,
+        cache_dir=benchmark_config.cache_dir,
+        api_key=benchmark_config.api_key,
     )
 
-    if
-
+    if (
+        not benchmark_config.evaluate_test_split
+        and dataset_config.val_split is not None
+    ):
+        dataset[dataset_config.test_split] = dataset[dataset_config.val_split]
+
+    splits = [
+        split
+        for split in [
+            dataset_config.train_split,
+            dataset_config.val_split,
+            dataset_config.test_split,
+        ]
+        if split is not None
+    ]
 
     # Remove empty examples from the datasets
     for text_feature in ["tokens", "text"]:
-        for split in
+        for split in splits:
             if text_feature in dataset[split].features:
                 dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)
 
@@ -67,7 +83,7 @@ def load_data(
     # Bootstrap the splits, if applicable
     if dataset_config.bootstrap_samples:
         bootstrapped_splits: dict[str, c.Sequence["Dataset"]] = dict()
-        for split in
+        for split in splits:
             bootstrap_indices = rng.integers(
                 0,
                 len(dataset[split]),
@@ -81,7 +97,12 @@ def load_data(
             DatasetDict(  # type: ignore[no-matching-overload]
                 {
                     split: bootstrapped_splits[split][idx]
-                    for split in
+                    for split in [
+                        dataset_config.train_split,
+                        dataset_config.val_split,
+                        dataset_config.test_split,
+                    ]
+                    if split is not None
                 }
             )
             for idx in range(benchmark_config.num_iterations)
@@ -92,7 +113,9 @@ def load_data(
     return datasets
 
 
-def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
+def load_raw_data(
+    dataset_config: "DatasetConfig", cache_dir: str, api_key: str | None
+) -> "DatasetDict":
     """Load the raw dataset.
 
     Args:
@@ -100,6 +123,8 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
            The configuration for the dataset.
        cache_dir:
            The directory to cache the dataset.
+       api_key:
+           The API key to use as the Hugging Face token.
 
     Returns:
         The dataset.
@@ -125,16 +150,38 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
            FileNotFoundError,
            ConnectionError,
            DatasetsError,
+           RepositoryNotFoundError,
            requests.ConnectionError,
            requests.ReadTimeout,
-        )
-
-
-
-
-
-
-
+        ):
+            try:
+                with no_terminal_output():
+                    dataset = load_dataset(
+                        path=dataset_config.source.split("::")[0],
+                        name=(
+                            dataset_config.source.split("::")[1]
+                            if "::" in dataset_config.source
+                            else None
+                        ),
+                        cache_dir=cache_dir,
+                        token=get_hf_token(api_key=api_key),
+                    )
+                break
+            except (
+                FileNotFoundError,
+                ConnectionError,
+                DatasetsError,
+                RepositoryNotFoundError,
+                requests.ConnectionError,
+                requests.ReadTimeout,
+            ) as e:
+                log(
+                    f"Failed to load dataset {dataset_config.source!r}, due to "
+                    f"the following error: {e}. Retrying...",
+                    level=logging.DEBUG,
+                )
+                time.sleep(1)
+                continue
         except HfHubHTTPError:
             raise HuggingFaceHubDown()
         else:
@@ -147,17 +194,22 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
     # Case where the dataset source is a dictionary with keys "train", "val" and "test",
     # with the values pointing to local CSV files
     else:
+        split_mapping = dict(
+            train=dataset_config.train_split,
+            val=dataset_config.val_split,
+            test=dataset_config.test_split,
+        )
         data_files = {
-
-            for
-            if
+            config_split: dataset_config.source[source_split]
+            for source_split, config_split in split_mapping.items()
+            if source_split in dataset_config.source and config_split is not None
         }
 
         # Get the file extension and ensure that all files have the same extension
        file_extensions = {
-
-            for
-            if
+            config_split: dataset_config.source[source_split].split(".")[-1]
+            for source_split, config_split in split_mapping.items()
+            if source_split in dataset_config.source and config_split is not None
         }
         if len(set(file_extensions.values())) != 1:
             raise InvalidBenchmark(
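For context on the local-files branch above: split_mapping maps the fixed source keys "train", "val" and "test" to the config's own split names, and the comprehension drops any split that is missing from the source or set to None in the config. A standalone sketch with made-up paths and split names:

# Standalone sketch of the data_files construction above; paths and split names are illustrative.
source = {"train": "data/train.csv", "test": "data/test.csv"}
split_mapping = dict(train="train", val="val", test="test")

data_files = {
    config_split: source[source_split]
    for source_split, config_split in split_mapping.items()
    if source_split in source and config_split is not None
}
print(data_files)  # {'train': 'data/train.csv', 'test': 'data/test.csv'}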
@@ -182,11 +234,15 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
             path=file_extension, data_files=data_files, cache_dir=cache_dir
         )
 
-    assert isinstance(dataset, DatasetDict)
-
-
-
-
-
-
-
+    assert isinstance(dataset, DatasetDict)
+    return DatasetDict(  # pyrefly: ignore[no-matching-overload]
+        {
+            split: dataset[split]
+            for split in [
+                dataset_config.train_split,
+                dataset_config.val_split,
+                dataset_config.test_split,
+            ]
+            if split is not None
+        }
+    )