EuroEval 16.4.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval has been flagged as possibly problematic; see the release advisory for details.
- euroeval/__init__.py +6 -0
- euroeval/benchmark_config_factory.py +51 -46
- euroeval/benchmark_modules/base.py +6 -5
- euroeval/benchmark_modules/hf.py +2 -9
- euroeval/benchmark_modules/litellm.py +14 -12
- euroeval/benchmark_modules/vllm.py +17 -10
- euroeval/benchmarker.py +61 -44
- euroeval/caching_utils.py +1 -1
- euroeval/cli.py +86 -8
- euroeval/constants.py +3 -0
- euroeval/data_loading.py +78 -30
- euroeval/data_models.py +326 -326
- euroeval/dataset_configs/__init__.py +10 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +25 -29
- euroeval/dataset_configs/danish.py +51 -88
- euroeval/dataset_configs/dutch.py +48 -86
- euroeval/dataset_configs/english.py +45 -76
- euroeval/dataset_configs/estonian.py +36 -38
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -68
- euroeval/dataset_configs/french.py +39 -74
- euroeval/dataset_configs/german.py +45 -81
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -78
- euroeval/dataset_configs/latvian.py +28 -34
- euroeval/dataset_configs/lithuanian.py +22 -26
- euroeval/dataset_configs/norwegian.py +72 -114
- euroeval/dataset_configs/polish.py +33 -60
- euroeval/dataset_configs/portuguese.py +33 -65
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +19 -24
- euroeval/dataset_configs/spanish.py +42 -76
- euroeval/dataset_configs/swedish.py +48 -84
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +3 -2
- euroeval/generation.py +5 -4
- euroeval/generation_utils.py +6 -5
- euroeval/languages.py +395 -323
- euroeval/metrics/huggingface.py +14 -3
- euroeval/metrics/llm_as_a_judge.py +1 -1
- euroeval/model_cache.py +6 -5
- euroeval/model_loading.py +1 -1
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +82 -43
- euroeval/prompt_templates/multiple_choice.py +81 -41
- euroeval/prompt_templates/named_entity_recognition.py +125 -44
- euroeval/prompt_templates/reading_comprehension.py +92 -43
- euroeval/prompt_templates/sentiment_classification.py +91 -43
- euroeval/prompt_templates/summarization.py +64 -39
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +4 -3
- euroeval/speed_benchmark.py +2 -1
- euroeval/task_group_utils/multiple_choice_classification.py +2 -1
- euroeval/task_group_utils/question_answering.py +24 -13
- euroeval/task_group_utils/sequence_classification.py +5 -4
- euroeval/task_group_utils/text_to_text.py +2 -1
- euroeval/task_group_utils/token_classification.py +11 -8
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +19 -10
- euroeval/types.py +10 -9
- euroeval/utils.py +6 -3
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.4.0.dist-info/RECORD +0 -75
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/cli.py
CHANGED
@@ -1,12 +1,18 @@
 """Command-line interface for benchmarking."""
 
+import collections.abc as c
+import importlib.util
+import logging
+from pathlib import Path
+
 import click
 
 from .benchmarker import Benchmarker
+from .data_models import DatasetConfig, Task
 from .dataset_configs import get_all_dataset_configs
 from .enums import Device, GenerativeType
 from .languages import get_all_languages
-from .
+from .logging_utils import log
 
 
 @click.command()
@@ -23,7 +29,6 @@ from .tasks import get_all_tasks
     default=None,
     show_default=True,
     multiple=True,
-    type=click.Choice(list(get_all_tasks().keys())),
     help="The dataset tasks to benchmark the model(s) on.",
 )
 @click.option(
@@ -65,7 +70,6 @@ from .tasks import get_all_tasks
     default=None,
     show_default=True,
     multiple=True,
-    type=click.Choice(list(get_all_dataset_configs().keys())),
     help="""The name of the benchmark dataset. We recommend to use the `task` and
     `language` options instead of this option.""",
 )
@@ -222,9 +226,17 @@ from .tasks import get_all_tasks
     help="Only download the requested model weights and datasets, and exit.",
     default=False,
 )
+@click.option(
+    "--custom-datasets-file",
+    type=click.Path(exists=False, dir_okay=False, path_type=Path),
+    default="custom_datasets.py",
+    show_default=True,
+    help="A path to a Python file containing DatasetConfig definitions for custom "
+    "datasets.",
+)
 def benchmark(
     model: tuple[str],
-    dataset: tuple[str],
+    dataset: tuple[str | DatasetConfig],
     language: tuple[str],
     model_language: tuple[str],
     dataset_language: tuple[str],
@@ -250,26 +262,92 @@ def benchmark(
     requires_safetensors: bool,
     generative_type: str | None,
     download_only: bool,
+    custom_datasets_file: Path,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
     models = list(model)
-    datasets = None if len(dataset) == 0 else list(dataset)
+    datasets: c.Sequence[str | DatasetConfig] | None = (
+        None if len(dataset) == 0 else list(dataset)
+    )
     languages: list[str] = list(language)
     model_languages = None if len(model_language) == 0 else list(model_language)
     dataset_languages = None if len(dataset_language) == 0 else list(dataset_language)
-    tasks = None if len(task) == 0 else list(task)
+    tasks: c.Sequence[str | Task] | None = None if len(task) == 0 else list(task)
     batch_size_int = int(batch_size)
     device = Device[device.upper()] if device is not None else None
     generative_type_obj = (
         GenerativeType[generative_type.upper()] if generative_type else None
     )
 
+    # Load all defined DatasetConfig and Task objects from the custom datasets file
+    if custom_datasets_file.exists():
+        # Load the custom module
+        spec = importlib.util.spec_from_file_location(
+            name="custom_datasets_module", location=str(custom_datasets_file.resolve())
+        )
+        if spec is None:
+            raise RuntimeError(
+                "Could not load the spec for the custom datasets file from "
+                f"{custom_datasets_file.resolve()}."
+            )
+        module = importlib.util.module_from_spec(spec=spec)
+        if spec.loader is None:
+            raise RuntimeError(
+                "Could not load the module for the custom datasets file from "
+                f"{custom_datasets_file.resolve()}."
+            )
+        spec.loader.exec_module(module)
+
+        # Load all the custom dataset configurations from the module
+        custom_dataset_configs: list[DatasetConfig] = [
+            obj for obj in vars(module).values() if isinstance(obj, DatasetConfig)
+        ]
+
+        # If the user has not specified any datasets or tasks, we just use all the usual
+        # datasets as well as all the custom ones that we loaded
+        if datasets is None and tasks is None:
+            datasets = custom_dataset_configs + list(get_all_dataset_configs().values())
+            datasets = [ds for ds in datasets if not ds.unofficial]
+
+        # If the user has specified only datasets, then we replace the custom dataset
+        # names that the user specified (if any) with the corresponding dataset configs
+        # that we loaded
+        elif datasets is not None and tasks is None:
+            dataset_name_to_config = {
+                config.name: config for config in custom_dataset_configs
+            }
+            datasets = [
+                dataset_name_to_config.get(ds, ds) if isinstance(ds, str) else ds
+                for ds in datasets
+            ]
+
+        # If the user has specified only tasks, then we find all the official usual and
+        # custom datasets belonging to that task, and use those. We reset the `tasks`
+        # variable as we're using the `datasets` variable directly instead
+        elif datasets is None and tasks is not None:
+            datasets = custom_dataset_configs + list(get_all_dataset_configs().values())
+            datasets = [
+                ds for ds in datasets if not ds.unofficial and ds.task.name in tasks
+            ]
+            tasks = None
+
+        # Log the loaded custom datasets and tasks
+        dataset_str = (
+            "the custom dataset"
+            if len(custom_dataset_configs) == 1
+            else f"{len(custom_dataset_configs):,} custom datasets"
+        )
+        log(
+            f"Loaded {dataset_str} from {custom_datasets_file.as_posix()!r}.\n",
+            level=logging.INFO,
+        )
+
     benchmarker = Benchmarker(
         language=languages,
         model_language=model_languages,
         dataset_language=dataset_languages,
-        task=tasks,
-        dataset=datasets,
+        task=tasks,  # type: ignore[arg-type]
+        dataset=datasets,  # type: ignore[arg-type]
         batch_size=batch_size_int,
         progress_bar=progress_bar,
        save_results=save_results,
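The new `--custom-datasets-file` option imports the given Python file and collects every module-level `DatasetConfig` instance via `vars(module)`. A minimal sketch of such a file follows; only `name`, `source`, and the `task` object (with its `.name`) are evidenced by this diff, so the exact constructor keywords are assumptions to check against `euroeval/data_models.py`:

# custom_datasets.py: picked up automatically, since --custom-datasets-file
# defaults to this filename in the working directory.
from euroeval.data_models import DatasetConfig
from euroeval.tasks import get_all_tasks

# A hypothetical sentiment dataset backed by local CSV files. All splits must
# share one file extension, and only "csv" is currently allowed (see the
# SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS constant below).
my_dataset = DatasetConfig(
    name="my-sentiment-dataset",  # selectable via --dataset my-sentiment-dataset
    source={
        "train": "data/train.csv",
        "val": "data/val.csv",
        "test": "data/test.csv",
    },
    task=get_all_tasks()["sentiment-classification"],  # hypothetical task key
)

With such a file in place, plain `euroeval --model <model-id>` would include the custom dataset alongside the official ones, while `--dataset my-sentiment-dataset` selects it by name.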
euroeval/constants.py
CHANGED
@@ -90,3 +90,6 @@ JSON_STRIP_CHARACTERS = ' {}\n\r":'
 # tasks. We also use this to determine whether we should store logprobs in the model
 # outputs (and cache).
 NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
+
+# We only allow loading local datasets in these file formats
+SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS = ["csv"]
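This constant is what `load_raw_data` consults before reading local dataset files (see the `data_loading.py` diff below). A small illustration of the check; the helper function here is hypothetical, but the membership test mirrors the loader's:

from euroeval.constants import SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS

def is_supported_local_file(path: str) -> bool:
    # The loader treats everything after the last dot as the file extension
    return path.split(".")[-1] in SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS

assert is_supported_local_file("data/train.csv")        # "csv" is allowed
assert not is_supported_local_file("data/train.jsonl")  # anything else is rejected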
euroeval/data_loading.py
CHANGED
@@ -1,5 +1,6 @@
 """Functions related to the loading of the data."""
 
+import collections.abc as c
 import logging
 import sys
 import time
@@ -11,6 +12,7 @@ from datasets.exceptions import DatasetsError
 from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator
 
+from .constants import SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
 from .logging_utils import log, no_terminal_output
 from .tasks import EUROPEAN_VALUES
@@ -64,7 +66,7 @@ def load_data(
 
     # Bootstrap the splits, if applicable
     if dataset_config.bootstrap_samples:
-        bootstrapped_splits: dict[str,
+        bootstrapped_splits: dict[str, c.Sequence["Dataset"]] = dict()
         for split in dataset_config.splits:
             bootstrap_indices = rng.integers(
                 0,
@@ -102,38 +104,84 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
     Returns:
         The dataset.
     """
-    [old lines 105-112 not captured in this diff view]
+    # Case where the dataset source is a Hugging Face ID
+    if isinstance(dataset_config.source, str):
+        num_attempts = 5
+        for _ in range(num_attempts):
+            try:
+                with no_terminal_output():
+                    dataset = load_dataset(
+                        path=dataset_config.source.split("::")[0],
+                        name=(
+                            dataset_config.source.split("::")[1]
+                            if "::" in dataset_config.source
+                            else None
+                        ),
+                        cache_dir=cache_dir,
+                        token=unscramble("XbjeOLhwebEaSaDUMqqaPaPIhgOcyOfDpGnX_"),
+                    )
+                break
+            except (
+                FileNotFoundError,
+                ConnectionError,
+                DatasetsError,
+                requests.ConnectionError,
+                requests.ReadTimeout,
+            ) as e:
+                log(
+                    f"Failed to load dataset {dataset_config.source!r}, due to "
+                    f"the following error: {e}. Retrying...",
+                    level=logging.DEBUG,
                 )
-    [old lines 114-122 not captured in this diff view]
-                    f"Failed to load dataset {dataset_config.huggingface_id!r}, due to "
-                    f"the following error: {e}. Retrying...",
-                    level=logging.DEBUG,
+                time.sleep(1)
+                continue
+            except HfHubHTTPError:
+                raise HuggingFaceHubDown()
+        else:
+            raise InvalidBenchmark(
+                f"Failed to load dataset {dataset_config.source!r} after "
+                f"{num_attempts} attempts. Run with verbose mode to see the individual "
+                "errors."
             )
-    [old lines 127-129 not captured in this diff view]
-        raise HuggingFaceHubDown()
+
+    # Case where the dataset source is a dictionary with keys "train", "val" and "test",
+    # with the values pointing to local CSV files
     else:
-    [old lines 132-136 not captured in this diff view]
+        data_files = {
+            split: dataset_config.source[split]
+            for split in dataset_config.splits
+            if split in dataset_config.source
+        }
+
+        # Get the file extension and ensure that all files have the same extension
+        file_extensions = {
+            split: dataset_config.source[split].split(".")[-1]
+            for split in dataset_config.splits
+            if split in dataset_config.source
+        }
+        if len(set(file_extensions.values())) != 1:
+            raise InvalidBenchmark(
+                "All data files in a custom dataset must have the same file extension. "
+                f"Got the extensions {', '.join(file_extensions.values())} for the "
+                f"dataset {dataset_config.name!r}."
+            )
+        file_extension = list(file_extensions.values())[0]
+
+        # Check that the file extension is supported
+        if file_extension not in SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS:
+            raise InvalidBenchmark(
+                "Unsupported file extension for custom dataset. Supported file "
+                "extensions are "
+                f"{', '.join(SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS)}, but got "
+                f"{file_extension!r}."
+            )
+
+        # Load the dataset
+        with no_terminal_output():
+            dataset = load_dataset(
+                path=file_extension, data_files=data_files, cache_dir=cache_dir
+            )
+
     assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
     missing_keys = [key for key in dataset_config.splits if key not in dataset]
     if missing_keys:
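The rewritten `load_raw_data` dispatches on the type of `dataset_config.source`: a string is treated as a Hugging Face dataset ID, optionally carrying a `::`-separated subset that becomes the `name` argument of `datasets.load_dataset`, while a mapping is treated as local data files keyed by split name. A sketch of the two accepted shapes, with hypothetical IDs and paths:

# Hugging Face source: the part before "::" is the repo ID; the optional part
# after it selects the subset, i.e. load_dataset(path=repo_id, name=subset)
hf_source = "some-org/some-dataset::some-subset"
repo_id, _, subset = hf_source.partition("::")  # "some-org/some-dataset", "some-subset"

# Local source: split name -> file path. All files must share one extension,
# and that extension must be listed in SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS.
local_source = {
    "train": "data/train.csv",
    "val": "data/val.csv",
    "test": "data/test.csv",
}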