EuroEval 16.3.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- euroeval/__init__.py +9 -2
- euroeval/benchmark_config_factory.py +51 -50
- euroeval/benchmark_modules/base.py +9 -21
- euroeval/benchmark_modules/fresh.py +2 -1
- euroeval/benchmark_modules/hf.py +101 -71
- euroeval/benchmark_modules/litellm.py +115 -53
- euroeval/benchmark_modules/vllm.py +107 -92
- euroeval/benchmarker.py +144 -121
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +86 -8
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +80 -29
- euroeval/data_models.py +338 -330
- euroeval/dataset_configs/__init__.py +12 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +55 -93
- euroeval/dataset_configs/dutch.py +48 -87
- euroeval/dataset_configs/english.py +45 -77
- euroeval/dataset_configs/estonian.py +42 -34
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -69
- euroeval/dataset_configs/french.py +39 -75
- euroeval/dataset_configs/german.py +45 -82
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -79
- euroeval/dataset_configs/latvian.py +28 -35
- euroeval/dataset_configs/lithuanian.py +28 -26
- euroeval/dataset_configs/norwegian.py +72 -115
- euroeval/dataset_configs/polish.py +33 -61
- euroeval/dataset_configs/portuguese.py +33 -66
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/spanish.py +42 -77
- euroeval/dataset_configs/swedish.py +52 -90
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +24 -17
- euroeval/generation.py +15 -14
- euroeval/generation_utils.py +8 -8
- euroeval/languages.py +395 -323
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +21 -6
- euroeval/metrics/llm_as_a_judge.py +6 -4
- euroeval/metrics/pipeline.py +17 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +17 -19
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +99 -42
- euroeval/prompt_templates/multiple_choice.py +102 -38
- euroeval/prompt_templates/named_entity_recognition.py +172 -51
- euroeval/prompt_templates/reading_comprehension.py +119 -42
- euroeval/prompt_templates/sentiment_classification.py +110 -40
- euroeval/prompt_templates/summarization.py +85 -40
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +11 -10
- euroeval/speed_benchmark.py +5 -6
- euroeval/task_group_utils/multiple_choice_classification.py +2 -4
- euroeval/task_group_utils/question_answering.py +24 -16
- euroeval/task_group_utils/sequence_classification.py +48 -35
- euroeval/task_group_utils/text_to_text.py +19 -9
- euroeval/task_group_utils/token_classification.py +21 -17
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +33 -22
- euroeval/types.py +10 -9
- euroeval/utils.py +35 -149
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +196 -39
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.3.0.dist-info/RECORD +0 -71
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED
@@ -21,7 +21,8 @@ if os.getenv("FULL_LOG") != "1":
     os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
 
 # Set up logging
-fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
+# fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
+fmt = colored("%(message)s", "light_yellow")
 logging.basicConfig(
     level=logging.CRITICAL if hasattr(sys, "_called_from_test") else logging.INFO,
     format=fmt,
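The net effect of this hunk is that EuroEval's console output now carries only the message, coloured light yellow, dropping the old timestamp prefix. A minimal standalone sketch of the new setup, assuming only that `termcolor` is installed (the lines mirror the diff above; the logged message is an arbitrary example):

import logging
import sys

from termcolor import colored

# Message-only format from the diff; the old format also carried a
# light-blue "%(asctime)s" timestamp.
fmt = colored("%(message)s", "light_yellow")
logging.basicConfig(
    level=logging.CRITICAL if hasattr(sys, "_called_from_test") else logging.INFO,
    format=fmt,
)
logging.getLogger("euroeval").info("hello")  # prints just the coloured message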
@@ -50,7 +51,13 @@ import importlib.metadata  # noqa: E402
 from dotenv import load_dotenv  # noqa: E402
 
 from .benchmarker import Benchmarker  # noqa: E402
-from .
+from .data_models import DatasetConfig  # noqa: E402
+from .logging_utils import block_terminal_output  # noqa: E402
+from .tasks import (  # noqa: E402
+    MULTIPLE_CHOICE,
+    TEXT_CLASSIFICATION,
+    TOKEN_CLASSIFICATION,
+)
 
 # Block unwanted terminal outputs. This blocks way more than the above, but since it
 # relies on importing from the `utils` module, external modules are already imported
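The added imports re-export `DatasetConfig` and the task constants at the package root. A short sketch of what this enables; only the import line is confirmed by the diff, the comment describes the situation before this release:

# These names can now be imported directly from `euroeval`, where previously
# they had to be pulled from the internal euroeval.tasks and
# euroeval.data_models submodules.
from euroeval import (
    MULTIPLE_CHOICE,
    TEXT_CLASSIFICATION,
    TOKEN_CLASSIFICATION,
    Benchmarker,
    DatasetConfig,
)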
euroeval/benchmark_config_factory.py
CHANGED
@@ -1,23 +1,20 @@
 """Factory class for creating dataset configurations."""
 
-import
+import collections.abc as c
 import sys
 import typing as t
 
 import torch
 
-from .data_models import BenchmarkConfig, BenchmarkConfigParams
+from .data_models import BenchmarkConfig, BenchmarkConfigParams, DatasetConfig, Task
 from .dataset_configs import get_all_dataset_configs
 from .enums import Device
 from .exceptions import InvalidBenchmark
 from .languages import get_all_languages
-from .tasks import
+from .tasks import get_all_tasks
 
 if t.TYPE_CHECKING:
-    from .data_models import Language
-
-
-logger = logging.getLogger("euroeval")
+    from .data_models import Language
 
 
 def build_benchmark_config(
@@ -44,7 +41,7 @@ def build_benchmark_config(
         default_language_codes=language_codes,
     )
 
-
+    dataset_configs = prepare_dataset_configs(
         task=benchmark_config_params.task,
         dataset=benchmark_config_params.dataset,
         dataset_languages=dataset_languages,
@@ -53,8 +50,7 @@ def build_benchmark_config(
     return BenchmarkConfig(
         model_languages=model_languages,
         dataset_languages=dataset_languages,
-
-        datasets=datasets,
+        datasets=dataset_configs,
         batch_size=benchmark_config_params.batch_size,
         raise_errors=benchmark_config_params.raise_errors,
         cache_dir=benchmark_config_params.cache_dir,
@@ -84,7 +80,9 @@ def build_benchmark_config(
     )
 
 
-def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
+def get_correct_language_codes(
+    language_codes: str | c.Sequence[str],
+) -> c.Sequence[str]:
     """Get correct language code(s).
 
     Args:
@@ -105,7 +103,7 @@ def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
     elif isinstance(language_codes, str):
         languages = [language_codes]
     else:
-        languages = language_codes
+        languages = list(language_codes)
 
     # If `languages` contains 'no' then also include 'nb' and 'nn'. Conversely, if
     # either 'nb' or 'nn' are specified then also include 'no'.
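The comment at the end of this hunk describes the Norwegian special case: the macro code 'no' implies both written norms, and either norm implies 'no'. A minimal sketch of that expansion rule, for illustration only (`expand_norwegian_codes` is a hypothetical name, not a function in EuroEval):

def expand_norwegian_codes(languages: list[str]) -> list[str]:
    """Hypothetical helper: 'no' pulls in 'nb'/'nn', and vice versa."""
    expanded = set(languages)
    if "no" in expanded:
        expanded |= {"nb", "nn"}
    elif expanded & {"nb", "nn"}:
        expanded.add("no")
    return sorted(expanded)

assert expand_norwegian_codes(["no"]) == ["nb", "nn", "no"]
assert expand_norwegian_codes(["nn", "da"]) == ["da", "nn", "no"]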
@@ -118,8 +116,9 @@ def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
 
 
 def prepare_languages(
-    language_codes: str |
-
+    language_codes: str | c.Sequence[str] | None,
+    default_language_codes: c.Sequence[str],
+) -> c.Sequence["Language"]:
     """Prepare language(s) for benchmarking.
 
     Args:
@@ -137,7 +136,7 @@ def prepare_languages(
     language_mapping = get_all_languages()
 
     # Create the list `languages_str` of language codes to use for models or datasets
-    languages_str:
+    languages_str: c.Sequence[str]
     if language_codes is None:
         languages_str = default_language_codes
     elif isinstance(language_codes, str):
@@ -154,12 +153,12 @@ def prepare_languages(
     return prepared_languages
 
 
-def prepare_tasks_and_datasets(
-    task: str |
-    dataset_languages:
-    dataset: str |
-) ->
-    """Prepare
+def prepare_dataset_configs(
+    task: "str | Task | c.Sequence[str | Task] | None",
+    dataset_languages: c.Sequence["Language"],
+    dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None",
+) -> c.Sequence["DatasetConfig"]:
+    """Prepare dataset config(s) for benchmarking.
 
     Args:
         task:
@@ -172,56 +171,58 @@ def prepare_tasks_and_datasets(
         included, limited by the `task` and `dataset_languages` parameters.
 
     Returns:
-        The prepared
+        The prepared dataset configs.
 
     Raises:
         InvalidBenchmark:
             If the task or dataset is not found in the benchmark tasks or datasets.
     """
-    # Create a dictionary that maps benchmark tasks to their associated benchmark
-    # task objects, and a dictionary that maps dataset names to their associated
-    # dataset configuration objects
-    task_mapping = get_all_tasks()
-    all_dataset_configs = get_all_dataset_configs()
-
     # Create the list of dataset tasks
+    task_mapping = get_all_tasks()
     try:
         if task is None:
-            tasks =
+            tasks = None
         elif isinstance(task, str):
             tasks = [task_mapping[task]]
+        elif isinstance(task, Task):
+            tasks = [task]
         else:
-            tasks = [task_mapping[t] for t in task]
+            tasks = [task_mapping[t] if isinstance(t, str) else t for t in task]
     except KeyError as e:
         raise InvalidBenchmark(f"Task {e} not found in the benchmark tasks.") from e
 
-
-
-
+    # Create the list of dataset configs
+    all_dataset_configs = get_all_dataset_configs()
+    all_official_dataset_configs: c.Sequence[DatasetConfig] = [
+        dataset_config
+        for dataset_config in all_dataset_configs.values()
         if not dataset_config.unofficial
     ]
-
-    dataset
-
-    dataset
-
-
-
-
+    try:
+        if dataset is None:
+            datasets = all_official_dataset_configs
+        elif isinstance(dataset, str):
+            datasets = [all_dataset_configs[dataset]]
+        elif isinstance(dataset, DatasetConfig):
+            datasets = [dataset]
+        else:
+            datasets = [
+                all_dataset_configs[d] if isinstance(d, str) else d for d in dataset
+            ]
+    except KeyError as e:
         raise InvalidBenchmark(
-            f"Dataset
-
-        )
+            f"Dataset {e} not found in the benchmark datasets."
+        ) from e
 
+    # Filter the dataset configs based on the specified tasks and languages
     datasets = [
-
-        for
-        if
-        and
-        and set(dataset_config.languages).intersection(dataset_languages)
+        ds
+        for ds in datasets
+        if (tasks is None or ds.task in tasks)
+        and any(lang in dataset_languages for lang in ds.languages)
     ]
 
-    return
+    return datasets
 
 
 def prepare_device(device: Device | None) -> torch.device:
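The rewritten `prepare_dataset_configs` normalises every accepted input shape (name strings, `Task`/`DatasetConfig` objects, sequences of either, or `None`) into a plain list before filtering. A small illustration of the same normalisation pattern, with a hypothetical toy type standing in for EuroEval's own classes:

from dataclasses import dataclass

@dataclass(frozen=True)
class Item:  # stands in for Task / DatasetConfig
    name: str

REGISTRY = {"a": Item("a"), "b": Item("b")}

def normalise(spec: "str | Item | list[str | Item] | None") -> list[Item] | None:
    # None means "no restriction"; strings are looked up; objects pass through.
    if spec is None:
        return None
    if isinstance(spec, str):
        return [REGISTRY[spec]]
    if isinstance(spec, Item):
        return [spec]
    return [REGISTRY[s] if isinstance(s, str) else s for s in spec]

assert normalise("a") == [Item("a")]
assert normalise([Item("b"), "a"]) == [Item("b"), Item("a")]
assert normalise(None) is None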
euroeval/benchmark_modules/base.py
CHANGED
@@ -3,24 +3,22 @@
 import collections.abc as c
 import logging
 import re
-import sys
 import typing as t
 from abc import ABC, abstractmethod
 from functools import cached_property, partial
 
 from datasets import Dataset, DatasetDict
 from torch import nn
-from tqdm.auto import tqdm
 
 from ..enums import TaskGroup
 from ..exceptions import InvalidBenchmark, NeedsEnvironmentVariable, NeedsExtraInstalled
+from ..logging_utils import get_pbar, log_once
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,
     token_classification,
 )
-from ..utils import log_once
 
 if t.TYPE_CHECKING:
     from transformers.tokenization_utils import PreTrainedTokenizer
@@ -36,8 +34,6 @@ if t.TYPE_CHECKING:
     from ..enums import BatchingPreference, GenerativeType
     from ..types import ComputeMetricsFunction, ExtractLabelsFunction
 
-logger = logging.getLogger("euroeval")
-
 
 class BenchmarkModule(ABC):
     """Abstract class for a benchmark module.
@@ -56,7 +52,7 @@ class BenchmarkModule(ABC):
     fresh_model: bool
     batching_preference: "BatchingPreference"
     high_priority: bool
-    allowed_params: dict[re.Pattern,
+    allowed_params: dict[re.Pattern, c.Sequence[str]] = {re.compile(r".*"): []}
 
     def __init__(
         self,
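The new `allowed_params` default maps a regex over model IDs to the parameter names permitted for matching models, and the catch-all entry permits none. A hypothetical sketch of how such a mapping could be consulted; EuroEval's actual check lives in `generation_utils.raise_if_wrong_params`, which this diff does not show:

import re

allowed_params: dict[re.Pattern, list[str]] = {
    re.compile(r".*"): [],  # default: no extra parameters for any model
}

def check_params(model_id: str, params: list[str]) -> None:
    # Collect the allowed names from every pattern matching this model ID.
    allowed = {
        name
        for pattern, names in allowed_params.items()
        if pattern.fullmatch(model_id)
        for name in names
    }
    unknown = [p for p in params if p not in allowed]
    if unknown:
        raise ValueError(f"Parameters not allowed for {model_id}: {unknown}")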
@@ -87,20 +83,12 @@ class BenchmarkModule(ABC):
 
     def _log_metadata(self) -> None:
         """Log the metadata of the model."""
-
-
-            logging_level = logging.CRITICAL
-        elif self.benchmark_config.verbose:
-            logging_level = logging.DEBUG
-        else:
-            logging_level = logging.INFO
-        logger.setLevel(logging_level)
-
-        logging_msg: str = ""
+        model_id = self.model_config.model_id
+        logging_msg: str = " ↳ "
         if self.num_params < 0:
-            logging_msg += "The model has an unknown number of parameters, "
+            logging_msg += f"The model {model_id} has an unknown number of parameters, "
         else:
-            logging_msg += f"The model has {self.num_params:,} parameters, "
+            logging_msg += f"The model {model_id} has {self.num_params:,} parameters, "
         if self.vocab_size < 0:
             logging_msg += "an unknown vocabulary size, "
         else:
@@ -179,7 +167,7 @@ class BenchmarkModule(ABC):
 
     @property
     @abstractmethod
-    def data_collator(self) -> c.Callable[[
+    def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.
 
         Returns:
@@ -253,7 +241,7 @@ class BenchmarkModule(ABC):
 
     def prepare_datasets(
         self, datasets: list[DatasetDict], task: "Task"
-    ) ->
+    ) -> c.Sequence[DatasetDict]:
         """Prepare the datasets for the model.
 
         This includes things like tokenisation.
@@ -273,7 +261,7 @@ class BenchmarkModule(ABC):
             tasks.
         """
         for idx, dataset in enumerate(
-
+            get_pbar(iterable=datasets, desc="Preparing datasets")
         ):
             prepared_dataset = self.prepare_dataset(
                 dataset=dataset, task=task, itr_idx=idx
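Together with the removal of `from tqdm.auto import tqdm` in the first hunk, this suggests progress bars are now created centrally via `logging_utils.get_pbar`. Its implementation is not part of this diff; a plausible sketch, assuming it simply wraps tqdm with shared defaults:

import collections.abc as c
import typing as t

from tqdm.auto import tqdm

def get_pbar(iterable: c.Iterable[t.Any], desc: str) -> tqdm:
    """Hypothetical reconstruction: one place to configure all progress bars."""
    return tqdm(iterable=iterable, desc=desc, leave=False)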
euroeval/benchmark_modules/fresh.py
CHANGED
@@ -27,7 +27,8 @@ from ..exceptions import (
     NeedsExtraInstalled,
 )
 from ..generation_utils import raise_if_wrong_params
-from ..
+from ..logging_utils import block_terminal_output
+from ..utils import create_model_cache_dir, get_hf_token
 from .hf import (
     HuggingFaceEncoderModel,
     align_model_and_tokeniser,