EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (87)
  1. euroeval/__init__.py +32 -14
  2. euroeval/benchmark_config_factory.py +92 -180
  3. euroeval/benchmark_modules/base.py +49 -39
  4. euroeval/benchmark_modules/fresh.py +35 -21
  5. euroeval/benchmark_modules/hf.py +280 -244
  6. euroeval/benchmark_modules/litellm.py +752 -312
  7. euroeval/benchmark_modules/vllm.py +570 -268
  8. euroeval/benchmarker.py +651 -528
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +49 -38
  12. euroeval/constants.py +44 -25
  13. euroeval/data_loading.py +111 -55
  14. euroeval/data_models.py +490 -323
  15. euroeval/dataset_configs/__init__.py +26 -4
  16. euroeval/dataset_configs/bosnian.py +39 -0
  17. euroeval/dataset_configs/bulgarian.py +56 -0
  18. euroeval/dataset_configs/croatian.py +56 -0
  19. euroeval/dataset_configs/czech.py +75 -0
  20. euroeval/dataset_configs/danish.py +78 -50
  21. euroeval/dataset_configs/dutch.py +74 -44
  22. euroeval/dataset_configs/english.py +71 -36
  23. euroeval/dataset_configs/estonian.py +111 -0
  24. euroeval/dataset_configs/faroese.py +25 -18
  25. euroeval/dataset_configs/finnish.py +63 -26
  26. euroeval/dataset_configs/french.py +65 -32
  27. euroeval/dataset_configs/german.py +77 -36
  28. euroeval/dataset_configs/greek.py +64 -0
  29. euroeval/dataset_configs/icelandic.py +68 -57
  30. euroeval/dataset_configs/italian.py +68 -36
  31. euroeval/dataset_configs/latvian.py +87 -0
  32. euroeval/dataset_configs/lithuanian.py +64 -0
  33. euroeval/dataset_configs/norwegian.py +98 -72
  34. euroeval/dataset_configs/polish.py +96 -0
  35. euroeval/dataset_configs/portuguese.py +63 -40
  36. euroeval/dataset_configs/serbian.py +64 -0
  37. euroeval/dataset_configs/slovak.py +55 -0
  38. euroeval/dataset_configs/slovene.py +56 -0
  39. euroeval/dataset_configs/spanish.py +68 -34
  40. euroeval/dataset_configs/swedish.py +82 -41
  41. euroeval/dataset_configs/ukrainian.py +64 -0
  42. euroeval/enums.py +12 -6
  43. euroeval/exceptions.py +21 -1
  44. euroeval/finetuning.py +34 -26
  45. euroeval/generation.py +76 -41
  46. euroeval/generation_utils.py +169 -34
  47. euroeval/languages.py +1020 -188
  48. euroeval/logging_utils.py +268 -0
  49. euroeval/metrics/__init__.py +6 -0
  50. euroeval/metrics/base.py +85 -0
  51. euroeval/metrics/huggingface.py +216 -0
  52. euroeval/metrics/llm_as_a_judge.py +260 -0
  53. euroeval/metrics/pipeline.py +289 -0
  54. euroeval/metrics/speed.py +48 -0
  55. euroeval/model_cache.py +40 -21
  56. euroeval/model_config.py +4 -5
  57. euroeval/model_loading.py +3 -0
  58. euroeval/prompt_templates/__init__.py +2 -0
  59. euroeval/prompt_templates/classification.py +206 -0
  60. euroeval/prompt_templates/linguistic_acceptability.py +157 -22
  61. euroeval/prompt_templates/multiple_choice.py +159 -17
  62. euroeval/prompt_templates/named_entity_recognition.py +318 -21
  63. euroeval/prompt_templates/reading_comprehension.py +207 -16
  64. euroeval/prompt_templates/sentiment_classification.py +205 -22
  65. euroeval/prompt_templates/summarization.py +122 -22
  66. euroeval/prompt_templates/token_classification.py +279 -0
  67. euroeval/scores.py +20 -9
  68. euroeval/speed_benchmark.py +11 -12
  69. euroeval/task_group_utils/multiple_choice_classification.py +21 -12
  70. euroeval/task_group_utils/question_answering.py +101 -73
  71. euroeval/task_group_utils/sequence_classification.py +144 -61
  72. euroeval/task_group_utils/text_to_text.py +33 -12
  73. euroeval/task_group_utils/token_classification.py +86 -89
  74. euroeval/tasks.py +75 -16
  75. euroeval/tokenisation_utils.py +603 -0
  76. euroeval/types.py +17 -11
  77. euroeval/utils.py +332 -137
  78. euroeval-16.7.1.dist-info/METADATA +623 -0
  79. euroeval-16.7.1.dist-info/RECORD +84 -0
  80. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
  81. euroeval/human_evaluation.py +0 -737
  82. euroeval/metrics.py +0 -452
  83. euroeval/tokenization_utils.py +0 -498
  84. euroeval-15.12.0.dist-info/METADATA +0 -285
  85. euroeval-15.12.0.dist-info/RECORD +0 -63
  86. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
  87. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/data_models.py CHANGED
@@ -1,85 +1,61 @@
  """Data models used in EuroEval."""

+ import collections.abc as c
  import json
  import pathlib
  import re
  import typing as t
+ from copy import deepcopy
  from dataclasses import dataclass, field

  import pydantic
  import torch

- from .enums import Device, InferenceBackend, ModelType, TaskGroup
- from .metrics import Metric
+ from .enums import Device, GenerativeType, ModelType, TaskGroup
+ from .exceptions import InvalidBenchmark
+ from .languages import (
+     ENGLISH,
+     EUROPEAN_PORTUGUESE,
+     NORWEGIAN,
+     NORWEGIAN_BOKMÅL,
+     NORWEGIAN_NYNORSK,
+     PORTUGUESE,
+     Language,
+ )
+ from .metrics.base import Metric
  from .types import ScoreDict
  from .utils import get_package_version

+ if t.TYPE_CHECKING:
+     from .enums import InferenceBackend
+

  @dataclass
- class Language:
-     """A benchmarkable language.
+ class PromptConfig:
+     """Configuration for task-specific prompting across languages.
+
+     Defines the prompt templates needed for evaluating a specific task in a given
+     language.

      Attributes:
-         code:
-             The ISO 639-1 language code of the language.
-         name:
-             The name of the language.
-         and_separator (optional):
-             The word 'and' in the language.
-         or_separator (optional):
-             The word 'or' in the language.
+         default_prompt_prefix:
+             The default prefix to use in the few-shot prompt.
+         default_prompt_template:
+             The default template for the prompt to use when benchmarking the dataset
+             using few-shot evaluation.
+         default_instruction_prompt:
+             The default prompt to use when benchmarking the dataset using
+             instruction-based evaluation.
+         default_prompt_label_mapping:
+             The default mapping from the labels to another phrase which is used as a
+             substitute for the label in few-shot evaluation. If set to "auto", the
+             mapping will be set to a 1:1 mapping between the labels and themselves.
      """

-     code: str
-     name: str
-     _and_separator: str | None = field(repr=False, default=None)
-     _or_separator: str | None = field(repr=False, default=None)
-
-     def __hash__(self) -> int:
-         """Return a hash of the language."""
-         return hash(self.code)
-
-     @property
-     def and_separator(self) -> str:
-         """Get the word 'and' in the language.
-
-         Returns:
-             The word 'and' in the language.
-
-         Raises:
-             NotImplementedError:
-                 If `and_separator` is `None`.
-         """
-         if not self._and_separator:
-             raise NotImplementedError(
-                 f"Separator for the word 'and' has not been defined for {self.name}."
-             )
-         return self._and_separator
-
-     @and_separator.setter
-     def and_separator(self, value: str | None) -> None:
-         self._and_separator = value
-
-     @property
-     def or_separator(self) -> str:
-         """Get the word 'or' in the language.
-
-         Returns:
-             The word 'or' in the language.
-
-         Raises:
-             NotImplementedError:
-                 If `or_separator` is `None`.
-         """
-         if not self._or_separator:
-             raise NotImplementedError(
-                 f"Separator for the word 'or' has not been defined for {self.name}."
-             )
-         return self._or_separator
-
-     @or_separator.setter
-     def or_separator(self, value: str | None) -> None:
-         self._or_separator = value
+     default_prompt_prefix: str
+     default_prompt_template: str
+     default_instruction_prompt: str
+     default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]


  @dataclass
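
For orientation, here is a minimal sketch of how the relocated `PromptConfig` might be filled in for one language and task. Only the field names come from the diff above; the template strings and label mapping are illustrative placeholders, not values from the package.

    # Hypothetical values; only the PromptConfig field names are taken from the diff.
    from euroeval.data_models import PromptConfig

    sentiment_prompts = PromptConfig(
        default_prompt_prefix="The following are documents and their sentiment.",
        default_prompt_template="Document: {text}\nSentiment: {label}",
        default_instruction_prompt="Document: {text}\n\nClassify the sentiment of the document.",
        default_prompt_label_mapping={"positive": "positive", "neutral": "neutral", "negative": "negative"},
    )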
@@ -104,210 +80,68 @@ class Task:
              using few-shot evaluation.
          default_labels:
              The default labels for datasets using this task.
+         requires_zero_shot (optional):
+             Whether to only allow zero-shot evaluation for this task. If True, the
+             task will not be evaluated using few-shot examples.
+         uses_structured_output (optional):
+             Whether the task uses structured output. If True, the task will return
+             structured output (e.g., BIO tags for NER). Defaults to False.
+         uses_logprobs (optional):
+             Whether the task uses log probabilities. If True, the task will return
+             log probabilities for the generated tokens. Defaults to False.
+         requires_logprobs (optional):
+             Whether the task requires log probabilities. Implies `uses_logprobs`.
+         default_allowed_model_types (optional):
+             A list of model types that are allowed to be evaluated on this task.
+             Defaults to all model types being allowed.
+         default_allowed_generative_types (optional):
+             A list of generative model types that are allowed to be evaluated on this
+             task. If None, all generative model types are allowed. Only relevant if
+             `allowed_model_types` includes generative models.
+         default_allow_invalid_model_outputs (optional):
+             Whether to allow invalid model outputs. This is only relevant for generative
+             models on classification tasks, where the model may generate an output
+             which is not one of the allowed labels. If True, the model output will be
+             mapped to the closest valid label. If False, the model output will be
+             considered incorrect and the evaluation will be aborted. Defaults to True.
      """

+     model_config = pydantic.ConfigDict(
+         protected_namespaces=(), arbitrary_types_allowed=True
+     )
+
      name: str
      task_group: TaskGroup
-     template_dict: dict["Language", "PromptConfig"]
-     metrics: list[Metric]
+     template_dict: dict[Language, PromptConfig]
+     metrics: c.Sequence[Metric]
      default_num_few_shot_examples: int
      default_max_generated_tokens: int
-     default_labels: list[str]
+     default_labels: c.Sequence[str] | None
+     requires_zero_shot: bool = False
+     uses_structured_output: bool = False
+     uses_logprobs: bool = False
+     requires_logprobs: bool = False
+     default_allowed_model_types: c.Sequence[ModelType] = field(
+         default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
+     )
+     default_allowed_generative_types: c.Sequence[GenerativeType] = field(
+         default_factory=lambda: [
+             GenerativeType.BASE,
+             GenerativeType.INSTRUCTION_TUNED,
+             GenerativeType.REASONING,
+         ]
+     )
+     default_allow_invalid_model_outputs: bool = True
+
+     def __post_init__(self) -> None:
+         """Post-initialisation checks."""
+         self.uses_logprobs = self.uses_logprobs or self.requires_logprobs

      def __hash__(self) -> int:
          """Return a hash of the task."""
          return hash(self.name)


- @dataclass
- class BenchmarkConfig:
-     """General benchmarking configuration, across datasets and models.
-
-     Attributes:
-         model_languages:
-             The languages of the models to benchmark.
-         dataset_languages:
-             The languages of the datasets in the benchmark.
-         tasks:
-             The tasks benchmark the model(s) on.
-         datasets:
-             The datasets to benchmark on.
-         batch_size:
-             The batch size to use.
-         raise_errors:
-             Whether to raise errors instead of skipping them.
-         cache_dir:
-             Directory to store cached models and datasets.
-         api_key:
-             The API key to use for a given inference API.
-         force:
-             Whether to force the benchmark to run even if the results are already
-             cached.
-         progress_bar:
-             Whether to show a progress bar.
-         save_results:
-             Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
-         device:
-             The device to use for benchmarking.
-         verbose:
-             Whether to print verbose output.
-         trust_remote_code:
-             Whether to trust remote code when loading models from the Hugging Face Hub.
-         clear_model_cache:
-             Whether to clear the model cache after benchmarking each model.
-         evaluate_test_split:
-             Whether to evaluate on the test split.
-         few_shot:
-             Whether to only evaluate the model using few-shot evaluation. Only relevant
-             if the model is generative.
-         num_iterations:
-             The number of iterations each model should be evaluated for.
-         api_base:
-             The base URL for a given inference API. Only relevant if `model` refers to a
-             model on an inference API.
-         api_version:
-             The version of the API to use. Only relevant if `model` refers to a model on
-             an inference API.
-         gpu_memory_utilization:
-             The GPU memory utilization to use for vLLM. A larger value will result in
-             faster evaluation, but at the risk of running out of GPU memory. Only reduce
-             this if you are running out of GPU memory. Only relevant if the model is
-             generative.
-         debug:
-             Whether to run the benchmark in debug mode.
-         run_with_cli:
-             Whether the benchmark is being run with the CLI.
-         only_allow_safetensors:
-             Whether to only allow models that use the safetensors format.
-     """
-
-     model_languages: list[Language]
-     dataset_languages: list[Language]
-     tasks: list[Task]
-     datasets: list[str]
-     batch_size: int
-     raise_errors: bool
-     cache_dir: str
-     api_key: str | None
-     force: bool
-     progress_bar: bool
-     save_results: bool
-     device: torch.device
-     verbose: bool
-     trust_remote_code: bool
-     clear_model_cache: bool
-     evaluate_test_split: bool
-     few_shot: bool
-     num_iterations: int
-     api_base: str | None
-     api_version: str | None
-     gpu_memory_utilization: float
-     debug: bool
-     run_with_cli: bool
-     only_allow_safetensors: bool
-
-
- class BenchmarkConfigParams(pydantic.BaseModel):
-     """The parameters for the benchmark configuration."""
-
-     model_config = pydantic.ConfigDict(protected_namespaces=())
-
-     progress_bar: bool
-     save_results: bool
-     task: str | list[str] | None
-     dataset: str | list[str] | None
-     language: str | list[str]
-     model_language: str | list[str] | None
-     dataset_language: str | list[str] | None
-     device: Device | None
-     batch_size: int
-     raise_errors: bool
-     cache_dir: str
-     api_key: str | None
-     force: bool
-     verbose: bool
-     trust_remote_code: bool
-     clear_model_cache: bool
-     evaluate_test_split: bool
-     few_shot: bool
-     num_iterations: int
-     api_base: str | None
-     api_version: str | None
-     gpu_memory_utilization: float
-     debug: bool
-     run_with_cli: bool
-     only_allow_safetensors: bool
-
-
- class BenchmarkResult(pydantic.BaseModel):
-     """A benchmark result."""
-
-     dataset: str
-     task: str
-     dataset_languages: list[str]
-     model: str
-     results: ScoreDict
-     num_model_parameters: int
-     max_sequence_length: int
-     vocabulary_size: int
-     merge: bool
-     generative: bool
-     generative_type: str | None
-     few_shot: bool
-     validation_split: bool
-     euroeval_version: str | None = get_package_version("euroeval")
-     transformers_version: str | None = get_package_version("transformers")
-     torch_version: str | None = get_package_version("torch")
-     vllm_version: str | None = get_package_version("vllm")
-     outlines_version: str | None = get_package_version("outlines")
-
-     @classmethod
-     def from_dict(cls, config: dict) -> "BenchmarkResult":
-         """Create a benchmark result from a dictionary.
-
-         Args:
-             config:
-                 The configuration dictionary.
-
-         Returns:
-             The benchmark result.
-         """
-         # To be backwards compatible, we accept old results which changed the model
-         # name with parameters rather than adding them as explicit parameters
-         val_matches = re.search(r"\(.*val.*\)$", config["model"])
-         few_shot_matches = re.search(r"\(.*few-shot.*\)$", config["model"])
-         zero_shot_matches = re.search(r"\(.*zero-shot.*\)$", config["model"])
-         config["model"] = re.sub(
-             r"\(.*(few-shot|val).*\)$", "", config["model"]
-         ).strip()
-
-         if "merge" not in config:
-             config["merge"] = False
-         if "generative" not in config:
-             config["generative"] = (
-                 few_shot_matches is not None or zero_shot_matches is not None
-             )
-         if "generative_type" not in config:
-             config["generative_type"] = None
-         if "few_shot" not in config:
-             config["few_shot"] = zero_shot_matches is None
-         if "validation_split" not in config:
-             config["validation_split"] = val_matches is not None
-
-         return cls(**config)
-
-     def append_to_results(self, results_path: pathlib.Path) -> None:
-         """Append the benchmark result to the results file.
-
-         Args:
-             results_path:
-                 The path to the results file.
-         """
-         json_str = json.dumps(self.model_dump())
-         with results_path.open("a") as f:
-             f.write("\n" + json_str)
-
-
  @dataclass
  class DatasetConfig:
      """Configuration for a dataset.
@@ -318,8 +152,9 @@ class DatasetConfig:
          pretty_name:
              A longer prettier name for the dataset, which allows cases and spaces. Used
              for logging.
-         huggingface_id:
-             The Hugging Face ID of the dataset.
+         source:
+             The source of the dataset, which can be a Hugging Face ID or a dictionary
+             with keys "train", "val" and "test" mapping to local CSV file paths.
          task:
              The task of the dataset.
          languages:
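
The renamed `source` field accepts either of the forms described in the docstring; both values below are hypothetical.

    # A Hugging Face dataset ID ...
    source = "some-org/some-dataset"

    # ... or a mapping from split names to local CSV files.
    source = {
        "train": "data/train.csv",
        "val": "data/val.csv",
        "test": "data/test.csv",
    }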
@@ -356,63 +191,154 @@ class DatasetConfig:
              to a 1:1 mapping between the labels and themselves. If None then the mapping
              will be set to the default mapping for the task and language. Defaults to
              None.
+         _allowed_model_types (optional):
+             A list of model types that are allowed to be evaluated on this dataset.
+             Defaults to the one for the task.
+         _allowed_generative_types (optional):
+             A list of generative model types that are allowed to be evaluated on this
+             dataset. If None, all generative model types are allowed. Only relevant if
+             `allowed_model_types` includes generative models. Defaults to the one for
+             the task.
+         _allow_invalid_model_outputs (optional):
+             Whether to allow invalid model outputs. This is only relevant for
+             generative models on classification tasks, where the model may generate an
+             output which is not one of the allowed labels. If True, the model output
+             will be mapped to the closest valid label. If False, the model output will
+             be considered incorrect and the evaluation will be aborted. Defaults to
+             the one for the task.
+         _logging_string (optional):
+             The string used to describe evaluation on the dataset in logging. If not
+             provided, a default string will be generated, based on the pretty name. Only
+             use this if the default string is not suitable.
+         splits (optional):
+             The names of the splits in the dataset. If not provided, defaults to
+             ["train", "val", "test"].
+         bootstrap_samples (optional):
+             Whether to bootstrap the dataset samples. Defaults to True.
          unofficial (optional):
              Whether the dataset is unofficial. Defaults to False.
      """

      name: str
      pretty_name: str
-     huggingface_id: str
+     source: str | dict[str, str]
      task: Task
-     languages: list[Language]
+     languages: c.Sequence[Language]
      _prompt_prefix: str | None = None
      _prompt_template: str | None = None
      _instruction_prompt: str | None = None
      _num_few_shot_examples: int | None = None
      _max_generated_tokens: int | None = None
-     _labels: list[str] | None = None
+     _labels: c.Sequence[str] | None = None
      _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
+     _allowed_model_types: c.Sequence[ModelType] | None = None
+     _allowed_generative_types: c.Sequence[GenerativeType] | None = None
+     _allow_invalid_model_outputs: bool | None = None
+     _logging_string: str | None = None
+     splits: c.Sequence[str] = field(default_factory=lambda: ["train", "val", "test"])
+     bootstrap_samples: bool = True
      unofficial: bool = False

+     @property
+     def main_language(self) -> Language:
+         """Get the main language of the dataset.
+
+         Returns:
+             The main language.
+         """
+         match len(self.languages):
+             case 0:
+                 raise InvalidBenchmark(
+                     f"Dataset {self.name!r} must have at least one language."
+                 )
+             case 1:
+                 return self.languages[0]
+             case _:
+                 if ENGLISH in self.languages:
+                     return ENGLISH
+                 elif NORWEGIAN in self.languages:
+                     return NORWEGIAN
+                 elif PORTUGUESE in self.languages:
+                     return PORTUGUESE
+                 else:
+                     return self.languages[0]
+
+     @property
+     def logging_string(self) -> str:
+         """The string used to describe evaluation on the dataset in logging."""
+         if self._logging_string is not None:
+             return self._logging_string
+
+         truncated_str = (
+             "truncated version of the "
+             if isinstance(self.source, str) and self.source.endswith("-mini")
+             else ""
+         )
+
+         logging_languages = list(deepcopy(self.languages))
+         if len(self.languages) > 1:
+             if (
+                 NORWEGIAN_BOKMÅL in self.languages
+                 and NORWEGIAN_NYNORSK in self.languages
+                 and NORWEGIAN in self.languages
+             ):
+                 logging_languages.remove(NORWEGIAN_BOKMÅL)
+                 logging_languages.remove(NORWEGIAN_NYNORSK)
+             elif (
+                 NORWEGIAN_BOKMÅL in self.languages
+                 or NORWEGIAN_NYNORSK in self.languages
+             ) and NORWEGIAN in self.languages:
+                 logging_languages.remove(NORWEGIAN)
+             if PORTUGUESE in self.languages and EUROPEAN_PORTUGUESE in self.languages:
+                 logging_languages.remove(EUROPEAN_PORTUGUESE)
+
+         if len(logging_languages) > 1:
+             languages_str = (
+                 ", ".join([lang.name for lang in logging_languages[:-1]])
+                 + f" and {logging_languages[-1].name}"
+             )
+         else:
+             languages_str = logging_languages[0].name
+
+         task_str = self.task.name.replace("-", " ")
+         dataset_name_str = (
+             self.pretty_name or self.name.replace("-", " ").replace("_", " ").title()
+         )
+         return (
+             f"the {truncated_str}{languages_str} {task_str} dataset {dataset_name_str}"
+         )
+
      @property
      def prompt_prefix(self) -> str:
          """The prefix to use in the few-shot prompt."""
-         main_language = self.languages[0]
-         prompt_config = self.task.template_dict[main_language]
+         prompt_config = self.task.template_dict[self.main_language]
          prompt_prefix = (
              prompt_config.default_prompt_prefix
              if self._prompt_prefix is None
              else self._prompt_prefix
          )
-         prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
          return prompt_prefix

      @property
      def prompt_template(self) -> str:
          """The template used during few-shot evaluation."""
-         main_language = self.languages[0]
-         prompt_config = self.task.template_dict[main_language]
+         prompt_config = self.task.template_dict[self.main_language]
          prompt_template = (
              prompt_config.default_prompt_template
              if self._prompt_template is None
              else self._prompt_template
          )
-         prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
          return prompt_template

      @property
      def instruction_prompt(self) -> str:
          """The prompt to use when evaluating instruction-tuned models."""
-         main_language = self.languages[0]
-         prompt_config = self.task.template_dict[main_language]
+         prompt_config = self.task.template_dict[self.main_language]
          instruction_prompt = (
              prompt_config.default_instruction_prompt
              if self._instruction_prompt is None
              else self._instruction_prompt
          )
-         instruction_prompt = instruction_prompt.replace(
-             "{labels_str}", self._labels_str
-         )
          return instruction_prompt

      @property
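
The `main_language` preference order is easy to miss inside the `match` statement; the standalone restatement below (not part of the package) mirrors the visible branches using plain language names.

    def pick_main_language(languages: list[str]) -> str:
        """Mirror of DatasetConfig.main_language, using plain language names."""
        if not languages:
            raise ValueError("A dataset must have at least one language.")
        if len(languages) == 1:
            return languages[0]
        for preferred in ("English", "Norwegian", "Portuguese"):
            if preferred in languages:
                return preferred
        return languages[0]

    assert pick_main_language(["Norwegian", "English"]) == "English"
    assert pick_main_language(["Danish"]) == "Danish"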
@@ -434,9 +360,18 @@ class DatasetConfig:
          )

      @property
-     def labels(self) -> list[str]:
+     def labels(self) -> c.Sequence[str]:
          """The labels in the dataset."""
-         return self._labels if self._labels is not None else self.task.default_labels
+         if self._labels is not None:
+             return self._labels
+         elif self.task.default_labels is not None:
+             return self.task.default_labels
+         else:
+             raise ValueError(
+                 f"Labels must be specified for dataset {self.name!r} with the "
+                 f"attribute `_labels`, as the task {self.task.name!r} does not have "
+                 "default labels."
+             )

      @property
      def prompt_label_mapping(self) -> dict[str, str]:
@@ -445,24 +380,48 @@ class DatasetConfig:
              return {label: label for label in self.labels}
          elif self._prompt_label_mapping is not None:
              return self._prompt_label_mapping
-
-         main_language = self.languages[0]
-         prompt_config = self.task.template_dict[main_language]
-
+         prompt_config = self.task.template_dict[self.main_language]
          if prompt_config.default_prompt_label_mapping == "auto":
              return {label: label for label in self.labels}
          else:
              return prompt_config.default_prompt_label_mapping

      @property
-     def id2label(self) -> dict[int, str]:
+     def allowed_model_types(self) -> c.Sequence[ModelType]:
+         """A list of model types that are allowed to be evaluated on this dataset."""
+         return (
+             self._allowed_model_types
+             if self._allowed_model_types is not None
+             else self.task.default_allowed_model_types
+         )
+
+     @property
+     def allowed_generative_types(self) -> c.Sequence[GenerativeType]:
+         """A list of generative model types that are allowed on this dataset."""
+         return (
+             self._allowed_generative_types
+             if self._allowed_generative_types is not None
+             else self.task.default_allowed_generative_types
+         )
+
+     @property
+     def allow_invalid_model_outputs(self) -> bool:
+         """Whether to allow invalid model outputs."""
+         return (
+             self._allow_invalid_model_outputs
+             if self._allow_invalid_model_outputs is not None
+             else self.task.default_allow_invalid_model_outputs
+         )
+
+     @property
+     def id2label(self) -> "HashableDict":
          """The mapping from ID to label."""
-         return {idx: label for idx, label in enumerate(self.labels)}
+         return HashableDict({idx: label for idx, label in enumerate(self.labels)})

      @property
-     def label2id(self) -> dict[str, int]:
+     def label2id(self) -> "HashableDict":
          """The mapping from label to ID."""
-         return {label: i for i, label in enumerate(self.labels)}
+         return HashableDict({label: i for i, label in enumerate(self.labels)})

      @property
      def num_labels(self) -> int:
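
All three new properties resolve a dataset-level override against the task default in the same way; the standalone restatement below is only for illustration and is not part of the package.

    from typing import TypeVar

    T = TypeVar("T")

    def dataset_or_task_default(dataset_value: T | None, task_default: T) -> T:
        """Use the dataset-level value when set, otherwise the task-level default."""
        return dataset_value if dataset_value is not None else task_default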
@@ -473,36 +432,36 @@ class DatasetConfig:
          """Return a hash of the dataset configuration."""
          return hash(self.name)

-     @property
-     def _labels_str(self) -> str:
+     def get_labels_str(self, labels: c.Sequence[str] | None = None) -> str:
          """Converts a set of labels to a natural string, in the specified language.

          If the task is NER, we separate using 'and' and use the mapped labels instead of
          the BIO NER labels.

          Args:
-             language: The language to be used when converting the labels.
+             labels (optional):
+                 The labels to convert to a natural string. If None, uses all the labels
+                 in the dataset. Defaults to None.

          Returns:
              The natural string representation of the labels in specified language.
          """
-         main_language = self.languages[0]
-
          if self.task.task_group == TaskGroup.TOKEN_CLASSIFICATION:
-             sep_word = main_language.and_separator
+             sep_word = self.main_language.and_separator
          else:
-             sep_word = main_language.or_separator
+             sep_word = self.main_language.or_separator

-         local_labels: list[str] = []
-         for label in self.labels:
-             if label not in self.prompt_label_mapping:
-                 continue
-             local_label = self.prompt_label_mapping[label]
-             if local_label not in local_labels:
-                 local_labels.append(local_label)
+         if labels is None:
+             labels = list()
+             for english_label in self.labels:
+                 if english_label not in self.prompt_label_mapping:
+                     continue
+                 label = self.prompt_label_mapping[english_label]
+                 if label not in labels:
+                     labels.append(label)

          # Convert labels to single-quoted labels - and remove duplicates
-         quoted_labels = [f"'{label}'" for label in local_labels]
+         quoted_labels = [f"'{label}'" for label in labels]

          if not quoted_labels:
              return ""
@@ -514,6 +473,213 @@ class DatasetConfig:
          return f"{', '.join(quoted_labels[:-1])} {sep_word} {quoted_labels[-1]}"


+ @dataclass
+ class BenchmarkConfig:
+     """General benchmarking configuration, across datasets and models.
+
+     Attributes:
+         datasets:
+             The datasets to benchmark on.
+         finetuning_batch_size:
+             The batch size to use for finetuning.
+         raise_errors:
+             Whether to raise errors instead of skipping them.
+         cache_dir:
+             Directory to store cached models and datasets.
+         api_key:
+             The API key to use for a given inference API.
+         api_base:
+             The base URL for a given inference API. Only relevant if `model` refers to a
+             model on an inference API.
+         api_version:
+             The version of the API to use. Only relevant if `model` refers to a model on
+             an inference API.
+         progress_bar:
+             Whether to show a progress bar.
+         save_results:
+             Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
+         device:
+             The device to use for benchmarking.
+         trust_remote_code:
+             Whether to trust remote code when loading models from the Hugging Face Hub.
+         clear_model_cache:
+             Whether to clear the model cache after benchmarking each model.
+         evaluate_test_split:
+             Whether to evaluate on the test split.
+         few_shot:
+             Whether to only evaluate the model using few-shot evaluation. Only relevant
+             if the model is generative.
+         num_iterations:
+             The number of iterations each model should be evaluated for.
+         gpu_memory_utilization:
+             The GPU memory utilization to use for vLLM. A larger value will result in
+             faster evaluation, but at the risk of running out of GPU memory. Only reduce
+             this if you are running out of GPU memory. Only relevant if the model is
+             generative.
+         requires_safetensors:
+             Whether to only allow models that use the safetensors format.
+         generative_type:
+             The type of generative model to benchmark. Only relevant if the model is
+             generative.
+         download_only:
+             Whether to only download the models, metrics and datasets without
+             evaluating.
+         force:
+             Whether to force the benchmark to run even if the results are already
+             cached.
+         verbose:
+             Whether to print verbose output.
+         debug:
+             Whether to run the benchmark in debug mode.
+         run_with_cli:
+             Whether the benchmark is being run with the CLI.
+     """
+
+     datasets: c.Sequence[DatasetConfig]
+     languages: c.Sequence[Language]
+     finetuning_batch_size: int
+     raise_errors: bool
+     cache_dir: str
+     api_key: str | None
+     api_base: str | None
+     api_version: str | None
+     progress_bar: bool
+     save_results: bool
+     device: torch.device
+     trust_remote_code: bool
+     clear_model_cache: bool
+     evaluate_test_split: bool
+     few_shot: bool
+     num_iterations: int
+     gpu_memory_utilization: float
+     requires_safetensors: bool
+     generative_type: GenerativeType | None
+     download_only: bool
+     force: bool
+     verbose: bool
+     debug: bool
+     run_with_cli: bool
+
+     @property
+     def tasks(self) -> c.Sequence[Task]:
+         """Get the tasks in the benchmark configuration."""
+         return list({dataset_config.task for dataset_config in self.datasets})
+
+     def __post_init__(self) -> None:
+         """Post-initialisation checks."""
+         # Set dummy API key if it has not been set and we're benchmarking a model on an
+         # inference API
+         if self.api_key is None and self.api_base is not None:
+             self.api_key = "dummy"
+
+
+ class BenchmarkConfigParams(pydantic.BaseModel):
+     """The parameters for the benchmark configuration."""
+
+     model_config = pydantic.ConfigDict(
+         protected_namespaces=(), arbitrary_types_allowed=True
+     )
+
+     task: str | Task | c.Sequence[str | Task] | None
+     dataset: str | DatasetConfig | c.Sequence[str | DatasetConfig] | None
+     progress_bar: bool
+     save_results: bool
+     language: str | c.Sequence[str]
+     device: Device | None
+     finetuning_batch_size: int
+     raise_errors: bool
+     cache_dir: str
+     api_key: str | None
+     api_base: str | None
+     api_version: str | None
+     trust_remote_code: bool
+     clear_model_cache: bool
+     evaluate_test_split: bool
+     few_shot: bool
+     num_iterations: int
+     requires_safetensors: bool
+     download_only: bool
+     gpu_memory_utilization: float
+     generative_type: GenerativeType | None
+     force: bool
+     verbose: bool
+     debug: bool
+     run_with_cli: bool
+
+
+ class BenchmarkResult(pydantic.BaseModel):
+     """A benchmark result."""
+
+     dataset: str
+     task: str
+     languages: c.Sequence[str]
+     model: str
+     results: ScoreDict
+     num_model_parameters: int
+     max_sequence_length: int
+     vocabulary_size: int
+     merge: bool
+     generative: bool
+     generative_type: str | None
+     few_shot: bool
+     validation_split: bool
+     euroeval_version: str | None = get_package_version("euroeval")
+     transformers_version: str | None = get_package_version("transformers")
+     torch_version: str | None = get_package_version("torch")
+     vllm_version: str | None = get_package_version("vllm")
+     xgrammar_version: str | None = get_package_version("xgrammar")
+
+     @classmethod
+     def from_dict(cls, config: dict) -> "BenchmarkResult":
+         """Create a benchmark result from a dictionary.
+
+         Args:
+             config:
+                 The configuration dictionary.
+
+         Returns:
+             The benchmark result.
+         """
+         # To be backwards compatible, we accept old results which changed the model
+         # name with parameters rather than adding them as explicit parameters
+         val_matches = re.search(r"\(.*val.*\)$", config["model"])
+         few_shot_matches = re.search(r"\(.*few-shot.*\)$", config["model"])
+         zero_shot_matches = re.search(r"\(.*zero-shot.*\)$", config["model"])
+         config["model"] = re.sub(
+             r"\(.*(few-shot|val).*\)$", "", config["model"]
+         ).strip()
+
+         if "merge" not in config:
+             config["merge"] = False
+         if "generative" not in config:
+             config["generative"] = (
+                 few_shot_matches is not None or zero_shot_matches is not None
+             )
+         if "generative_type" not in config:
+             config["generative_type"] = None
+         if "few_shot" not in config:
+             config["few_shot"] = zero_shot_matches is None
+         if "validation_split" not in config:
+             config["validation_split"] = val_matches is not None
+
+         # Backwards compatibility
+         if "dataset_languages" in config:
+             config["languages"] = config.pop("dataset_languages")
+
+         return cls(**config)
+
+     def append_to_results(self, results_path: pathlib.Path) -> None:
+         """Append the benchmark result to the results file.
+
+         Args:
+             results_path:
+                 The path to the results file.
+         """
+         json_str = json.dumps(self.model_dump())
+         with results_path.open("a") as f:
+             f.write("\n" + json_str)
+
+
  @dataclass
  class ModelConfig:
      """Configuration for a model.
@@ -523,6 +689,8 @@ class ModelConfig:
              The ID of the model.
          revision:
              The revision of the model.
+         param:
+             The parameter of the model, or None if the model has no parameters.
          task:
              The task that the model was trained on.
          languages:
@@ -544,9 +712,10 @@ class ModelConfig:

      model_id: str
      revision: str
+     param: str | None
      task: str
-     languages: list[Language]
-     inference_backend: InferenceBackend
+     languages: c.Sequence[Language]
+     inference_backend: "InferenceBackend"
      merge: bool
      model_type: ModelType
      fresh: bool
@@ -573,7 +742,7 @@ class PreparedModelInputs:
              instead.
      """

-     texts: list[str] | None = None
+     texts: c.Sequence[str] | None = None
      input_ids: torch.Tensor | None = None
      attention_mask: torch.Tensor | None = None

@@ -591,8 +760,8 @@ class GenerativeModelOutput:
              token and its logprob. Can be None if the scores are not available.
      """

-     sequences: list[str]
-     scores: list[list[list[tuple[str, float]]]] | None = None
+     sequences: c.Sequence[str]
+     scores: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]] | None = None


  @dataclass
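
For the widened `scores` type above, one plausible shape (all values invented): the outer sequence has one entry per generated sequence, the next per generated token, and the innermost holds the candidate (token, logprob) pairs.

    from euroeval.data_models import GenerativeModelOutput

    output = GenerativeModelOutput(
        sequences=["positive"],
        scores=[                                           # one entry per sequence
            [                                              # one entry per generated token
                [("positive", -0.1), ("negative", -2.5)],  # candidate (token, logprob) pairs
            ],
        ],
    )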
@@ -609,7 +778,7 @@ class SingleGenerativeModelOutput:
      """

      sequence: str
-     scores: list[list[tuple[str, float]]] | None = None
+     scores: c.Sequence[c.Sequence[tuple[str, float]]] | None = None


  @dataclass
@@ -627,33 +796,31 @@ class HFModelInfo:
      """

      pipeline_tag: str
-     tags: list[str]
+     tags: c.Sequence[str]
      adapter_base_model_id: str | None


  @dataclass
- class PromptConfig:
-     """Configuration for task-specific prompting across languages.
-
-     Defines the prompt templates needed for evaluating a specific task in a given
-     language.
+ class ModelIdComponents:
+     """A model ID split into its components.

      Attributes:
-         default_prompt_prefix:
-             The default prefix to use in the few-shot prompt.
-         default_prompt_template:
-             The default template for the prompt to use when benchmarking the dataset
-             using few-shot evaluation.
-         default_instruction_prompt:
-             The default prompt to use when benchmarking the dataset using
-             instruction-based evaluation.
-         default_prompt_label_mapping:
-             The default mapping from the labels to another phrase which is used as a
-             substitute for the label in few-shot evaluation. If set to "auto", the
-             mapping will be set to a 1:1 mapping between the labels and themselves.
+         model_id:
+             The main model ID without revision or parameters.
+         revision:
+             The revision of the model, if any.
+         param:
+             The parameter of the model, if any.
      """

-     default_prompt_prefix: str
-     default_prompt_template: str
-     default_instruction_prompt: str
-     default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]
+     model_id: str
+     revision: str
+     param: str | None
+
+
+ class HashableDict(dict):
+     """A hashable dictionary."""
+
+     def __hash__(self) -> int:  # type: ignore[override]
+         """Return the hash of the dictionary."""
+         return hash(frozenset(self.items()))
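
`HashableDict` is what `id2label` and `label2id` now return, which makes those mappings hashable; one plausible use is caching, sketched below with a hypothetical helper that is not part of the package.

    from functools import lru_cache

    from euroeval.data_models import HashableDict

    @lru_cache(maxsize=None)
    def label_summary(id2label: HashableDict) -> str:
        """Render an id2label mapping once and cache the result."""
        return ", ".join(f"{idx}: {label}" for idx, label in sorted(id2label.items()))

    print(label_summary(HashableDict({0: "negative", 1: "positive"})))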