ScandEval 16.12.0__py3-none-any.whl → 16.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. scandeval/async_utils.py +46 -0
  2. scandeval/benchmark_config_factory.py +26 -2
  3. scandeval/benchmark_modules/fresh.py +2 -1
  4. scandeval/benchmark_modules/hf.py +50 -12
  5. scandeval/benchmark_modules/litellm.py +25 -15
  6. scandeval/benchmark_modules/vllm.py +3 -3
  7. scandeval/benchmarker.py +15 -33
  8. scandeval/cli.py +2 -4
  9. scandeval/constants.py +5 -0
  10. scandeval/custom_dataset_configs.py +152 -0
  11. scandeval/data_loading.py +87 -31
  12. scandeval/data_models.py +396 -225
  13. scandeval/dataset_configs/__init__.py +51 -25
  14. scandeval/dataset_configs/albanian.py +1 -1
  15. scandeval/dataset_configs/belarusian.py +47 -0
  16. scandeval/dataset_configs/bulgarian.py +1 -1
  17. scandeval/dataset_configs/catalan.py +1 -1
  18. scandeval/dataset_configs/croatian.py +1 -1
  19. scandeval/dataset_configs/danish.py +3 -2
  20. scandeval/dataset_configs/dutch.py +7 -6
  21. scandeval/dataset_configs/english.py +4 -3
  22. scandeval/dataset_configs/estonian.py +8 -7
  23. scandeval/dataset_configs/faroese.py +1 -1
  24. scandeval/dataset_configs/finnish.py +5 -4
  25. scandeval/dataset_configs/french.py +6 -5
  26. scandeval/dataset_configs/german.py +4 -3
  27. scandeval/dataset_configs/greek.py +1 -1
  28. scandeval/dataset_configs/hungarian.py +1 -1
  29. scandeval/dataset_configs/icelandic.py +4 -3
  30. scandeval/dataset_configs/italian.py +4 -3
  31. scandeval/dataset_configs/latvian.py +2 -2
  32. scandeval/dataset_configs/lithuanian.py +1 -1
  33. scandeval/dataset_configs/norwegian.py +6 -5
  34. scandeval/dataset_configs/polish.py +4 -3
  35. scandeval/dataset_configs/portuguese.py +5 -4
  36. scandeval/dataset_configs/romanian.py +2 -2
  37. scandeval/dataset_configs/serbian.py +1 -1
  38. scandeval/dataset_configs/slovene.py +1 -1
  39. scandeval/dataset_configs/spanish.py +4 -3
  40. scandeval/dataset_configs/swedish.py +4 -3
  41. scandeval/dataset_configs/ukrainian.py +1 -1
  42. scandeval/generation_utils.py +6 -6
  43. scandeval/metrics/llm_as_a_judge.py +1 -1
  44. scandeval/metrics/pipeline.py +1 -1
  45. scandeval/model_cache.py +34 -4
  46. scandeval/prompt_templates/linguistic_acceptability.py +9 -0
  47. scandeval/prompt_templates/multiple_choice.py +9 -0
  48. scandeval/prompt_templates/named_entity_recognition.py +21 -0
  49. scandeval/prompt_templates/reading_comprehension.py +10 -0
  50. scandeval/prompt_templates/sentiment_classification.py +11 -0
  51. scandeval/string_utils.py +157 -0
  52. scandeval/task_group_utils/sequence_classification.py +2 -5
  53. scandeval/task_group_utils/token_classification.py +2 -4
  54. scandeval/utils.py +6 -323
  55. scandeval-16.13.0.dist-info/METADATA +334 -0
  56. scandeval-16.13.0.dist-info/RECORD +94 -0
  57. scandeval-16.12.0.dist-info/METADATA +0 -667
  58. scandeval-16.12.0.dist-info/RECORD +0 -90
  59. {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
  60. {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
  61. {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/data_loading.py CHANGED
@@ -9,14 +9,15 @@ import typing as t
 import requests
 from datasets import DatasetDict, load_dataset
 from datasets.exceptions import DatasetsError
-from huggingface_hub.errors import HfHubHTTPError
+from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError
 from numpy.random import Generator
 
 from .constants import SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
 from .logging_utils import log, no_terminal_output
+from .string_utils import unscramble
 from .tasks import EUROPEAN_VALUES
-from .utils import unscramble
+from .utils import get_hf_token
 
 if t.TYPE_CHECKING:
     from datasets import Dataset
@@ -47,15 +48,30 @@ def load_data(
             If the Hugging Face Hub is down.
     """
     dataset = load_raw_data(
-        dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
+        dataset_config=dataset_config,
+        cache_dir=benchmark_config.cache_dir,
+        api_key=benchmark_config.api_key,
     )
 
-    if not benchmark_config.evaluate_test_split and "val" in dataset:
-        dataset["test"] = dataset["val"]
+    if (
+        not benchmark_config.evaluate_test_split
+        and dataset_config.val_split is not None
+    ):
+        dataset[dataset_config.test_split] = dataset[dataset_config.val_split]
+
+    splits = [
+        split
+        for split in [
+            dataset_config.train_split,
+            dataset_config.val_split,
+            dataset_config.test_split,
+        ]
+        if split is not None
+    ]
 
     # Remove empty examples from the datasets
     for text_feature in ["tokens", "text"]:
-        for split in dataset_config.splits:
+        for split in splits:
            if text_feature in dataset[split].features:
                dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)
 
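For context, a minimal sketch of the new split handling in `load_data`, using `types.SimpleNamespace` stand-ins for the ScandEval config objects; the attribute names follow the diff above, everything else is illustrative:

```python
from types import SimpleNamespace

# Illustrative stand-ins for DatasetConfig and BenchmarkConfig.
dataset_config = SimpleNamespace(train_split="train", val_split="val", test_split="test")
benchmark_config = SimpleNamespace(evaluate_test_split=False)

dataset = {"train": ["a"], "val": ["b"], "test": ["c"]}

# When the test split is not being evaluated and a validation split exists,
# the validation data replaces the test data.
if not benchmark_config.evaluate_test_split and dataset_config.val_split is not None:
    dataset[dataset_config.test_split] = dataset[dataset_config.val_split]

# Only the configured (non-None) splits are processed further.
splits = [
    split
    for split in (
        dataset_config.train_split,
        dataset_config.val_split,
        dataset_config.test_split,
    )
    if split is not None
]

print(splits)           # ['train', 'val', 'test']
print(dataset["test"])  # ['b'] -- the validation examples
```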
@@ -67,7 +83,7 @@ def load_data(
     # Bootstrap the splits, if applicable
     if dataset_config.bootstrap_samples:
         bootstrapped_splits: dict[str, c.Sequence["Dataset"]] = dict()
-        for split in dataset_config.splits:
+        for split in splits:
             bootstrap_indices = rng.integers(
                 0,
                 len(dataset[split]),
@@ -81,7 +97,12 @@ def load_data(
             DatasetDict(  # type: ignore[no-matching-overload]
                 {
                     split: bootstrapped_splits[split][idx]
-                    for split in dataset_config.splits
+                    for split in [
+                        dataset_config.train_split,
+                        dataset_config.val_split,
+                        dataset_config.test_split,
+                    ]
+                    if split is not None
                 }
             )
             for idx in range(benchmark_config.num_iterations)
@@ -92,7 +113,9 @@ def load_data(
     return datasets
 
 
-def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
+def load_raw_data(
+    dataset_config: "DatasetConfig", cache_dir: str, api_key: str | None
+) -> "DatasetDict":
     """Load the raw dataset.
 
     Args:
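As a self-contained sketch of threading an optional API key through as a Hugging Face token, in the spirit of the new `api_key` parameter; `resolve_token` below is a hypothetical helper for illustration, not the package's `get_hf_token`:

```python
import os


def resolve_token(api_key: str | None) -> str | None:
    """Hypothetical helper: prefer an explicit key, fall back to the environment."""
    if api_key:
        return api_key
    return os.getenv("HUGGING_FACE_HUB_TOKEN") or None


print(resolve_token("hf_example_key"))  # 'hf_example_key'
print(resolve_token(None))              # whatever the environment provides, else None
```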
@@ -100,6 +123,8 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
             The configuration for the dataset.
         cache_dir:
             The directory to cache the dataset.
+        api_key:
+            The API key to use as the Hugging Face token.
 
     Returns:
         The dataset.
@@ -125,16 +150,38 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
                 FileNotFoundError,
                 ConnectionError,
                 DatasetsError,
+                RepositoryNotFoundError,
                 requests.ConnectionError,
                 requests.ReadTimeout,
-            ) as e:
-                log(
-                    f"Failed to load dataset {dataset_config.source!r}, due to "
-                    f"the following error: {e}. Retrying...",
-                    level=logging.DEBUG,
-                )
-                time.sleep(1)
-                continue
+            ):
+                try:
+                    with no_terminal_output():
+                        dataset = load_dataset(
+                            path=dataset_config.source.split("::")[0],
+                            name=(
+                                dataset_config.source.split("::")[1]
+                                if "::" in dataset_config.source
+                                else None
+                            ),
+                            cache_dir=cache_dir,
+                            token=get_hf_token(api_key=api_key),
+                        )
+                    break
+                except (
+                    FileNotFoundError,
+                    ConnectionError,
+                    DatasetsError,
+                    RepositoryNotFoundError,
+                    requests.ConnectionError,
+                    requests.ReadTimeout,
+                ) as e:
+                    log(
+                        f"Failed to load dataset {dataset_config.source!r}, due to "
+                        f"the following error: {e}. Retrying...",
+                        level=logging.DEBUG,
+                    )
+                    time.sleep(1)
+                    continue
 
             except HfHubHTTPError:
                 raise HuggingFaceHubDown()
@@ -147,17 +194,22 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
     # Case where the dataset source is a dictionary with keys "train", "val" and "test",
     # with the values pointing to local CSV files
     else:
+        split_mapping = dict(
+            train=dataset_config.train_split,
+            val=dataset_config.val_split,
+            test=dataset_config.test_split,
+        )
         data_files = {
-            split: dataset_config.source[split]
-            for split in dataset_config.splits
-            if split in dataset_config.source
+            config_split: dataset_config.source[source_split]
+            for source_split, config_split in split_mapping.items()
+            if source_split in dataset_config.source and config_split is not None
         }
 
         # Get the file extension and ensure that all files have the same extension
         file_extensions = {
-            split: dataset_config.source[split].split(".")[-1]
-            for split in dataset_config.splits
-            if split in dataset_config.source
+            config_split: dataset_config.source[source_split].split(".")[-1]
+            for source_split, config_split in split_mapping.items()
+            if source_split in dataset_config.source and config_split is not None
         }
         if len(set(file_extensions.values())) != 1:
             raise InvalidBenchmark(
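A minimal sketch of what the new `split_mapping` does for local-file sources, with toy values (the file names and split names here are illustrative):

```python
# A local-file source without a validation file, and a config with no val split.
source = {"train": "data/train.csv", "test": "data/test.csv"}
train_split, val_split, test_split = "train", None, "test"

split_mapping = dict(train=train_split, val=val_split, test=test_split)
data_files = {
    config_split: source[source_split]
    for source_split, config_split in split_mapping.items()
    if source_split in source and config_split is not None
}
print(data_files)  # {'train': 'data/train.csv', 'test': 'data/test.csv'}

# All files must share a single extension, mirroring the check that follows.
file_extensions = {path.split(".")[-1] for path in data_files.values()}
assert len(file_extensions) == 1
```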
@@ -182,11 +234,15 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
             path=file_extension, data_files=data_files, cache_dir=cache_dir
         )
 
-    assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
-    missing_keys = [key for key in dataset_config.splits if key not in dataset]
-    if missing_keys:
-        raise InvalidBenchmark(
-            "The dataset is missing the following required splits: "
-            f"{', '.join(missing_keys)}"
-        )
-    return DatasetDict({key: dataset[key] for key in dataset_config.splits})  # type: ignore[no-matching-overload]
+    assert isinstance(dataset, DatasetDict)
+    return DatasetDict(  # pyrefly: ignore[no-matching-overload]
+        {
+            split: dataset[split]
+            for split in [
+                dataset_config.train_split,
+                dataset_config.val_split,
+                dataset_config.test_split,
+            ]
+            if split is not None
+        }
+    )
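Finally, a small sketch of the new return value: only the configured (non-None) splits survive into the returned `DatasetDict`, and the explicit missing-split check from 16.12.0 is gone. The toy data below is illustrative and assumes the `datasets` library is installed:

```python
from datasets import Dataset, DatasetDict

full = DatasetDict(
    {
        "train": Dataset.from_dict({"text": ["a"]}),
        "val": Dataset.from_dict({"text": ["b"]}),
        "test": Dataset.from_dict({"text": ["c"]}),
        "extra": Dataset.from_dict({"text": ["d"]}),  # not part of the config
    }
)

train_split, val_split, test_split = "train", None, "test"
filtered = DatasetDict(
    {
        split: full[split]
        for split in (train_split, val_split, test_split)
        if split is not None
    }
)
print(list(filtered))  # ['train', 'test']
```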