ScandEval 16.11.0__py3-none-any.whl → 16.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. scandeval/__init__.py +0 -9
  2. scandeval/async_utils.py +46 -0
  3. scandeval/benchmark_config_factory.py +31 -2
  4. scandeval/benchmark_modules/fresh.py +2 -1
  5. scandeval/benchmark_modules/hf.py +76 -23
  6. scandeval/benchmark_modules/litellm.py +33 -15
  7. scandeval/benchmark_modules/vllm.py +97 -44
  8. scandeval/benchmarker.py +29 -33
  9. scandeval/cli.py +11 -0
  10. scandeval/constants.py +36 -2
  11. scandeval/custom_dataset_configs.py +152 -0
  12. scandeval/data_loading.py +87 -31
  13. scandeval/data_models.py +405 -224
  14. scandeval/dataset_configs/__init__.py +51 -25
  15. scandeval/dataset_configs/albanian.py +1 -1
  16. scandeval/dataset_configs/belarusian.py +47 -0
  17. scandeval/dataset_configs/bulgarian.py +1 -1
  18. scandeval/dataset_configs/catalan.py +1 -1
  19. scandeval/dataset_configs/croatian.py +1 -1
  20. scandeval/dataset_configs/danish.py +3 -2
  21. scandeval/dataset_configs/dutch.py +16 -5
  22. scandeval/dataset_configs/english.py +4 -3
  23. scandeval/dataset_configs/estonian.py +8 -7
  24. scandeval/dataset_configs/faroese.py +1 -1
  25. scandeval/dataset_configs/finnish.py +5 -4
  26. scandeval/dataset_configs/french.py +6 -5
  27. scandeval/dataset_configs/german.py +4 -3
  28. scandeval/dataset_configs/greek.py +1 -1
  29. scandeval/dataset_configs/hungarian.py +1 -1
  30. scandeval/dataset_configs/icelandic.py +4 -3
  31. scandeval/dataset_configs/italian.py +4 -3
  32. scandeval/dataset_configs/latvian.py +2 -2
  33. scandeval/dataset_configs/lithuanian.py +1 -1
  34. scandeval/dataset_configs/norwegian.py +6 -5
  35. scandeval/dataset_configs/polish.py +4 -3
  36. scandeval/dataset_configs/portuguese.py +5 -4
  37. scandeval/dataset_configs/romanian.py +2 -2
  38. scandeval/dataset_configs/serbian.py +1 -1
  39. scandeval/dataset_configs/slovene.py +1 -1
  40. scandeval/dataset_configs/spanish.py +4 -3
  41. scandeval/dataset_configs/swedish.py +4 -3
  42. scandeval/dataset_configs/ukrainian.py +1 -1
  43. scandeval/generation_utils.py +6 -6
  44. scandeval/metrics/__init__.py +1 -0
  45. scandeval/metrics/bias.py +237 -0
  46. scandeval/metrics/huggingface.py +2 -1
  47. scandeval/metrics/llm_as_a_judge.py +1 -1
  48. scandeval/metrics/pipeline.py +1 -1
  49. scandeval/model_cache.py +34 -4
  50. scandeval/prompt_templates/linguistic_acceptability.py +9 -0
  51. scandeval/prompt_templates/multiple_choice.py +9 -0
  52. scandeval/prompt_templates/named_entity_recognition.py +21 -0
  53. scandeval/prompt_templates/reading_comprehension.py +10 -0
  54. scandeval/prompt_templates/sentiment_classification.py +11 -0
  55. scandeval/string_utils.py +157 -0
  56. scandeval/task_group_utils/sequence_classification.py +2 -5
  57. scandeval/task_group_utils/token_classification.py +2 -4
  58. scandeval/tasks.py +22 -0
  59. scandeval/tokenisation_utils.py +12 -1
  60. scandeval/utils.py +13 -383
  61. scandeval-16.13.0.dist-info/METADATA +334 -0
  62. scandeval-16.13.0.dist-info/RECORD +94 -0
  63. scandeval-16.11.0.dist-info/METADATA +0 -649
  64. scandeval-16.11.0.dist-info/RECORD +0 -89
  65. {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
  66. {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
  67. {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/data_models.py CHANGED
@@ -1,7 +1,10 @@
 """Data models used in EuroEval."""
 
 import collections.abc as c
+import importlib.metadata
+import importlib.util
 import json
+import logging
 import re
 import typing as t
 from copy import deepcopy
@@ -12,6 +15,7 @@ import pydantic
 import torch
 from transformers.generation.configuration_utils import GenerationConfig
 
+from .constants import ATTENTION_BACKENDS, MAX_NUMBER_OF_LOGGING_LANGUAGES
 from .enums import Device, GenerativeType, ModelType, TaskGroup
 from .exceptions import InvalidBenchmark
 from .languages import (
@@ -23,14 +27,30 @@ from .languages import (
     PORTUGUESE,
     Language,
 )
+from .logging_utils import log_once
 from .metrics.base import Metric
 from .types import ScoreDict
-from .utils import get_package_version
 
 if t.TYPE_CHECKING:
     from .enums import InferenceBackend
 
 
+def get_package_version(package_name: str) -> str | None:
+    """Get the version of a package.
+
+    Args:
+        package_name:
+            The name of the package.
+
+    Returns:
+        The version of the package, or None if the package is not installed.
+    """
+    try:
+        return importlib.metadata.version(package_name)
+    except importlib.metadata.PackageNotFoundError:
+        return None
+
+
 @dataclass
 class PromptConfig:
     """Configuration for task-specific prompting across languages.
@@ -79,8 +99,9 @@ class Task:
         default_max_generated_tokens:
             The default maximum number of tokens to generate when benchmarking the task
            using few-shot evaluation.
-        default_labels:
-            The default labels for datasets using this task.
+        default_labels (optional):
+            The default labels for datasets using this task. Can be None if the labels
+            should be set manually in the dataset configs. Defaults to an empty tuple.
         requires_zero_shot (optional):
             Whether to only allow zero-shot evaluation for this task. If True, the
             task will not be evaluated using few-shot examples.
@@ -117,7 +138,7 @@ class Task:
     metrics: c.Sequence[Metric]
     default_num_few_shot_examples: int
     default_max_generated_tokens: int
-    default_labels: c.Sequence[str] | None
+    default_labels: c.Sequence[str] | None = tuple()
     requires_zero_shot: bool = False
     uses_structured_output: bool = False
     uses_logprobs: bool = False
@@ -143,133 +164,362 @@ class Task:
         return hash(self.name)
 
 
-@dataclass
 class DatasetConfig:
-    """Configuration for a dataset.
+    """Configuration for a dataset."""
+
+    def __init__(
+        self,
+        task: Task,
+        languages: c.Sequence[Language],
+        name: str | None = None,
+        pretty_name: str | None = None,
+        source: str | dict[str, str] | None = None,
+        prompt_prefix: str | None = None,
+        prompt_template: str | None = None,
+        instruction_prompt: str | None = None,
+        num_few_shot_examples: int | None = None,
+        max_generated_tokens: int | None = None,
+        labels: c.Sequence[str] | None = None,
+        prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None,
+        allowed_model_types: c.Sequence[ModelType] | None = None,
+        allowed_generative_types: c.Sequence[GenerativeType] | None = None,
+        allow_invalid_model_outputs: bool | None = None,
+        train_split: str | None = "train",
+        val_split: str | None = "val",
+        test_split: str = "test",
+        bootstrap_samples: bool = True,
+        unofficial: bool = False,
+        _prompt_prefix: str | None = None,
+        _prompt_template: str | None = None,
+        _instruction_prompt: str | None = None,
+        _num_few_shot_examples: int | None = None,
+        _max_generated_tokens: int | None = None,
+        _labels: c.Sequence[str] | None = None,
+        _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None,
+        _allowed_model_types: c.Sequence[ModelType] | None = None,
+        _allowed_generative_types: c.Sequence[GenerativeType] | None = None,
+        _allow_invalid_model_outputs: bool | None = None,
+        _logging_string: str | None = None,
+    ) -> None:
+        """Initialise a DatasetConfig object.
 
-    Attributes:
-        name:
-            The name of the dataset. Must be lower case with no spaces.
-        pretty_name:
-            A longer prettier name for the dataset, which allows cases and spaces. Used
-            for logging.
-        source:
-            The source of the dataset, which can be a Hugging Face ID or a dictionary
-            with keys "train", "val" and "test" mapping to local CSV file paths.
-        task:
-            The task of the dataset.
-        languages:
-            The ISO 639-1 language codes of the entries in the dataset.
-        id2label:
-            The mapping from ID to label.
-        label2id:
-            The mapping from label to ID.
-        num_labels:
-            The number of labels in the dataset.
-        _prompt_prefix (optional):
-            The prefix to use in the few-shot prompt. Defaults to the template for the
-            task and language.
-        _prompt_template (optional):
-            The template for the prompt to use when benchmarking the dataset using
-            few-shot evaluation. Defaults to the template for the task and language.
-        _instruction_prompt (optional):
-            The prompt to use when benchmarking the dataset using instruction-based
-            evaluation. Defaults to the template for the task and language.
-        _num_few_shot_examples (optional):
-            The number of examples to use when benchmarking the dataset using few-shot
-            evaluation. For a classification task, these will be drawn evenly from
-            each label. Defaults to the template for the task and language.
-        _max_generated_tokens (optional):
-            The maximum number of tokens to generate when benchmarking the dataset
-            using few-shot evaluation. Defaults to the template for the task and
-            language.
-        _labels (optional):
-            The labels in the dataset. Defaults to the template for the task and
-            language.
-        _prompt_label_mapping (optional):
-            A mapping from the labels to another phrase which is used as a substitute
-            for the label in few-shot evaluation. If "auto" then the mapping will be set
-            to a 1:1 mapping between the labels and themselves. If None then the mapping
-            will be set to the default mapping for the task and language. Defaults to
-            None.
-        _allowed_model_types (optional):
-            A list of model types that are allowed to be evaluated on this dataset.
-            Defaults to the one for the task.
-        _allowed_generative_types (optional):
-            A list of generative model types that are allowed to be evaluated on this
-            dataset. If None, all generative model types are allowed. Only relevant if
-            `allowed_model_types` includes generative models. Defaults to the one for
-            the task.
-        _allow_invalid_model_outputs (optional):
-            Whether to allow invalid model outputs. This is only relevant for
-            generative models on classification tasks, where the model may generate an
-            output which is not one of the allowed labels. If True, the model output
-            will be mapped to the closest valid label. If False, the model output will
-            be considered incorrect and the evaluation will be aborted. Defaults to
-            the one for the task.
-        _logging_string (optional):
-            The string used to describe evaluation on the dataset in logging. If not
-            provided, a default string will be generated, based on the pretty name. Only
-            use this if the default string is not suitable.
-        splits (optional):
-            The names of the splits in the dataset. If not provided, defaults to
-            ["train", "val", "test"].
-        bootstrap_samples (optional):
-            Whether to bootstrap the dataset samples. Defaults to True.
-        unofficial (optional):
-            Whether the dataset is unofficial. Defaults to False.
-    """
+        Args:
+            task:
+                The task of the dataset.
+            languages:
+                The ISO 639-1 language codes of the entries in the dataset.
+            name (optional):
+                The name of the dataset. Must be lower case with no spaces. Can be None
+                if and only if the dataset config resides directly in the Hugging Face
+                dataset repo. Defaults to None.
+            pretty_name (optional):
+                A longer prettier name for the dataset, which allows cases and spaces.
+                Used for logging. Can be None if and only if the dataset config resides
+                directly in the Hugging Face dataset repo. Defaults to None.
+            source (optional):
+                The source of the dataset, which can be a Hugging Face ID or a
+                dictionary with keys "train", "val" and "test" mapping to local CSV file
+                paths. Can be None if and only if the dataset config resides directly in
+                the Hugging Face dataset repo. Defaults to None.
+            prompt_prefix (optional):
+                The prefix to use in the few-shot prompt. Defaults to the template for
+                the task and language.
+            prompt_template (optional):
+                The template for the prompt to use when benchmarking the dataset using
+                few-shot evaluation. Defaults to the template for the task and language.
+            instruction_prompt (optional):
+                The prompt to use when benchmarking the dataset using instruction-based
+                evaluation. Defaults to the template for the task and language.
+            num_few_shot_examples (optional):
+                The number of examples to use when benchmarking the dataset using
+                few-shot evaluation. For a classification task, these will be drawn
+                evenly from each label. Defaults to the template for the task and
+                language.
+            max_generated_tokens (optional):
+                The maximum number of tokens to generate when benchmarking the dataset
+                using few-shot evaluation. Defaults to the template for the task and
+                language.
+            labels (optional):
+                The labels in the dataset. Defaults to the template for the task and
+                language.
+            prompt_label_mapping (optional):
+                A mapping from the labels to another phrase which is used as a
+                substitute for the label in few-shot evaluation. If "auto" then the
+                mapping will be set to a 1:1 mapping between the labels and themselves.
+                If None then the mapping will be set to the default mapping for the task
+                and language. Defaults to None.
+            allowed_model_types (optional):
+                A list of model types that are allowed to be evaluated on this dataset.
+                Defaults to the one for the task.
+            allowed_generative_types (optional):
+                A list of generative model types that are allowed to be evaluated on
+                this dataset. If None, all generative model types are allowed. Only
+                relevant if `allowed_model_types` includes generative models. Defaults
+                to the one for the task.
+            allow_invalid_model_outputs (optional):
+                Whether to allow invalid model outputs. This is only relevant for
+                generative models on classification tasks, where the model may generate
+                an output which is not one of the allowed labels. If True, the model
+                output will be mapped to the closest valid label. If False, the model
+                output will be considered incorrect and the evaluation will be aborted.
+                Defaults to the one for the task.
+            train_split (optional):
+                The name of the split to use as the training set. Can be None if there
+                is no training split in the dataset. Defaults to "train".
+            val_split (optional):
+                The name of the split to use as the validation set. Can be None if there
+                is no validation split in the dataset. Defaults to "val".
+            test_split (optional):
+                The name of the split to use as the test set. Defaults to "test".
+            bootstrap_samples (optional):
+                Whether to bootstrap the dataset samples. Defaults to True.
+            unofficial (optional):
+                Whether the dataset is unofficial. Defaults to False.
+            _prompt_prefix (optional):
+                This argument is deprecated. Please use `prompt_prefix` instead.
+            _prompt_template (optional):
+                This argument is deprecated. Please use `prompt_template` instead.
+            _instruction_prompt (optional):
+                This argument is deprecated. Please use `instruction_prompt` instead.
+            _num_few_shot_examples (optional):
+                This argument is deprecated. Please use `num_few_shot_examples` instead.
+            _max_generated_tokens (optional):
+                This argument is deprecated. Please use `max_generated_tokens` instead.
+            _labels (optional):
+                This argument is deprecated. Please use `labels` instead.
+            _prompt_label_mapping (optional):
+                This argument is deprecated. Please use `prompt_label_mapping` instead.
+            _allowed_model_types (optional):
+                This argument is deprecated. Please use `allowed_model_types` instead.
+            _allowed_generative_types (optional):
+                This argument is deprecated. Please use `allowed_generative_types`
+                instead.
+            _allow_invalid_model_outputs (optional):
+                This argument is deprecated. Please use `allow_invalid_model_outputs`
+                instead.
+            _logging_string (optional):
+                This argument is deprecated. Please use `logging_string` instead.
+        """
+        # Deprecation warnings
+        if _prompt_prefix is not None:
+            log_once(
+                "The `_prompt_prefix` argument is deprecated. Please use "
+                "`prompt_prefix` instead.",
+                level=logging.WARNING,
+            )
+            prompt_prefix = _prompt_prefix
+        if _prompt_template is not None:
+            log_once(
+                "The `_prompt_template` argument is deprecated. Please use "
+                "`prompt_template` instead.",
+                level=logging.WARNING,
+            )
+            prompt_template = _prompt_template
+        if _instruction_prompt is not None:
+            log_once(
+                "The `_instruction_prompt` argument is deprecated. Please use "
+                "`instruction_prompt` instead.",
+                level=logging.WARNING,
+            )
+            instruction_prompt = _instruction_prompt
+        if _num_few_shot_examples is not None:
+            log_once(
+                "The `_num_few_shot_examples` argument is deprecated. Please use "
+                "`num_few_shot_examples` instead.",
+                level=logging.WARNING,
+            )
+            num_few_shot_examples = _num_few_shot_examples
+        if _max_generated_tokens is not None:
+            log_once(
+                "The `_max_generated_tokens` argument is deprecated. Please use "
+                "`max_generated_tokens` instead.",
+                level=logging.WARNING,
+            )
+            max_generated_tokens = _max_generated_tokens
+        if _labels is not None:
+            log_once(
+                "The `_labels` argument is deprecated. Please use `labels` instead.",
+                level=logging.WARNING,
+            )
+            labels = _labels
+        if _prompt_label_mapping is not None:
+            log_once(
+                "The `_prompt_label_mapping` argument is deprecated. Please use "
+                "`prompt_label_mapping` instead.",
+                level=logging.WARNING,
+            )
+            prompt_label_mapping = _prompt_label_mapping
+        if _allowed_model_types is not None:
+            log_once(
+                "The `_allowed_model_types` argument is deprecated. Please use "
+                "`allowed_model_types` instead.",
+                level=logging.WARNING,
+            )
+            allowed_model_types = _allowed_model_types
+        if _allowed_generative_types is not None:
+            log_once(
+                "The `_allowed_generative_types` argument is deprecated. Please use "
+                "`allowed_generative_types` instead.",
+                level=logging.WARNING,
+            )
+            allowed_generative_types = _allowed_generative_types
+        if _allow_invalid_model_outputs is not None:
+            log_once(
+                "The `_allow_invalid_model_outputs` argument is deprecated. Please use "
+                "`allow_invalid_model_outputs` instead.",
+                level=logging.WARNING,
+            )
+            allow_invalid_model_outputs = _allow_invalid_model_outputs
+        if _logging_string is not None:
+            log_once(
+                "The `_logging_string` argument is deprecated and is not used anymore. "
+                "Using it will have no effect.",
+                level=logging.WARNING,
+            )
 
-    name: str
-    pretty_name: str
-    source: str | dict[str, str]
-    task: Task
-    languages: c.Sequence[Language]
-    _prompt_prefix: str | None = None
-    _prompt_template: str | None = None
-    _instruction_prompt: str | None = None
-    _num_few_shot_examples: int | None = None
-    _max_generated_tokens: int | None = None
-    _labels: c.Sequence[str] | None = None
-    _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
-    _allowed_model_types: c.Sequence[ModelType] | None = None
-    _allowed_generative_types: c.Sequence[GenerativeType] | None = None
-    _allow_invalid_model_outputs: bool | None = None
-    _logging_string: str | None = None
-    splits: c.Sequence[str] = field(default_factory=lambda: ["train", "val", "test"])
-    bootstrap_samples: bool = True
-    unofficial: bool = False
+        self._name = name
+        self._pretty_name = pretty_name
+        self._source = source
+        self.task = task
+        self.languages = languages
+
+        template = self.task.template_dict.get(self.main_language)
+        self.prompt_prefix = (
+            prompt_prefix
+            if prompt_prefix is not None
+            else template.default_prompt_prefix
+            if template is not None
+            else ""
+        )
+        self.prompt_template = (
+            prompt_template
+            if prompt_template is not None
+            else template.default_prompt_template
+            if template is not None
+            else ""
+        )
+        self.instruction_prompt = (
+            instruction_prompt
+            if instruction_prompt is not None
+            else template.default_instruction_prompt
+            if template is not None
+            else ""
+        )
+        self.num_few_shot_examples = (
+            num_few_shot_examples
+            if num_few_shot_examples is not None
+            else self.task.default_num_few_shot_examples
+        )
+        self.max_generated_tokens = (
+            max_generated_tokens
+            if max_generated_tokens is not None
+            else self.task.default_max_generated_tokens
+        )
+        self.labels = (
+            labels if labels is not None else self.task.default_labels or list()
+        )
+        if prompt_label_mapping is None:
+            prompt_label_mapping = (
+                template.default_prompt_label_mapping
+                if template is not None
+                else dict()
+            )
+        self.prompt_label_mapping = (
+            {label: label for label in self.labels}
+            if prompt_label_mapping == "auto"
+            else prompt_label_mapping
+        )
+        self.allowed_model_types = (
+            allowed_model_types
+            if allowed_model_types is not None
+            else self.task.default_allowed_model_types
+        )
+        self.allowed_generative_types = (
+            allowed_generative_types
+            if allowed_generative_types is not None
+            else self.task.default_allowed_generative_types
+        )
+        self.allow_invalid_model_outputs = (
+            allow_invalid_model_outputs
+            if allow_invalid_model_outputs is not None
+            else self.task.default_allow_invalid_model_outputs
+        )
+        self.train_split = train_split
+        self.val_split = val_split
+        self.test_split = test_split
+        self.bootstrap_samples = bootstrap_samples
+        self.unofficial = unofficial
 
     @property
-    def main_language(self) -> Language:
-        """Get the main language of the dataset.
+    def name(self) -> str:
+        """The name of the dataset.
 
         Returns:
-            The main language.
+            The name of the dataset.
         """
-        match len(self.languages):
-            case 0:
-                raise InvalidBenchmark(
-                    f"Dataset {self.name!r} must have at least one language."
-                )
-            case 1:
-                return self.languages[0]
-            case _:
-                if ENGLISH in self.languages:
-                    return ENGLISH
-                elif NORWEGIAN in self.languages:
-                    return NORWEGIAN
-                elif PORTUGUESE in self.languages:
-                    return PORTUGUESE
-                else:
-                    return self.languages[0]
+        if self._name is None:
+            raise ValueError("The name of the dataset is not set!")
+        return self._name
+
+    @name.setter
+    def name(self, value: str) -> None:
+        """Set the name of the dataset.
+
+        Args:
+            value:
+                The new name of the dataset.
+        """
+        self._name = value
+
+    @property
+    def pretty_name(self) -> str:
+        """The pretty name of the dataset.
+
+        Returns:
+            The pretty name of the dataset.
+        """
+        if self._pretty_name is None:
+            raise ValueError("The pretty name of the dataset is not set!")
+        return self._pretty_name
+
+    @pretty_name.setter
+    def pretty_name(self, value: str) -> None:
+        """Set the pretty name of the dataset.
+
+        Args:
+            value:
+                The new pretty name of the dataset.
+        """
+        self._pretty_name = value
+
+    @property
+    def source(self) -> str | dict[str, str]:
+        """The source of the dataset.
+
+        Returns:
+            The source of the dataset.
+        """
+        if self._source is None:
+            raise ValueError("The source of the dataset is not set!")
+        return self._source
+
+    @source.setter
+    def source(self, value: str | dict[str, str]) -> None:
+        """Set the source of the dataset.
+
+        Args:
+            value:
+                The new source of the dataset.
+        """
+        self._source = value
 
     @property
     def logging_string(self) -> str:
-        """The string used to describe evaluation on the dataset in logging."""
-        if self._logging_string is not None:
-            return self._logging_string
+        """The string used to describe evaluation on the dataset in logging.
 
+        Returns:
+            The logging string.
+        """
         truncated_str = (
             "truncated version of the "
             if isinstance(self.source, str) and self.source.endswith("-mini")
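Note: this hunk replaces the `@dataclass`-based `DatasetConfig`, whose underscore-prefixed fields were resolved lazily through properties, with an explicit `__init__` whose public keyword arguments are resolved eagerly against the task and language templates. A hedged construction sketch; the `SENT` task and `DANISH` language constants are assumptions for illustration, not verified exports:

from scandeval.data_models import DatasetConfig
from scandeval.languages import DANISH  # assumed language constant
from scandeval.tasks import SENT  # hypothetical sentiment task constant

config = DatasetConfig(
    task=SENT,
    languages=[DANISH],
    name="my-sentiment-dataset",
    pretty_name="My Sentiment Dataset",
    source={"train": "train.csv", "val": "val.csv", "test": "test.csv"},
    num_few_shot_examples=12,  # overrides the task default
)

# Deprecated underscore-prefixed keywords are still accepted: each one logs a
# one-time warning via log_once and is forwarded to its replacement argument.
legacy = DatasetConfig(
    task=SENT,
    languages=[DANISH],
    name="legacy-dataset",
    pretty_name="Legacy Dataset",
    source="some-org/some-hf-id",
    _labels=["negative", "positive"],
)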
@@ -293,126 +543,48 @@ class DatasetConfig:
         if PORTUGUESE in self.languages and EUROPEAN_PORTUGUESE in self.languages:
             logging_languages.remove(EUROPEAN_PORTUGUESE)
 
-        if len(logging_languages) > 1:
+        if len(logging_languages) > MAX_NUMBER_OF_LOGGING_LANGUAGES:
+            languages_str = ""
+        elif len(logging_languages) > 1:
             languages_str = (
                 ", ".join([lang.name for lang in logging_languages[:-1]])
                 + f" and {logging_languages[-1].name}"
+                + " "
             )
         else:
-            languages_str = logging_languages[0].name
+            languages_str = logging_languages[0].name + " "
 
         task_str = self.task.name.replace("-", " ")
         dataset_name_str = (
             self.pretty_name or self.name.replace("-", " ").replace("_", " ").title()
         )
         return (
-            f"the {truncated_str}{languages_str} {task_str} dataset {dataset_name_str}"
-        )
-
-    @property
-    def prompt_prefix(self) -> str:
-        """The prefix to use in the few-shot prompt."""
-        prompt_config = self.task.template_dict[self.main_language]
-        prompt_prefix = (
-            prompt_config.default_prompt_prefix
-            if self._prompt_prefix is None
-            else self._prompt_prefix
-        )
-        return prompt_prefix
-
-    @property
-    def prompt_template(self) -> str:
-        """The template used during few-shot evaluation."""
-        prompt_config = self.task.template_dict[self.main_language]
-        prompt_template = (
-            prompt_config.default_prompt_template
-            if self._prompt_template is None
-            else self._prompt_template
-        )
-        return prompt_template
-
-    @property
-    def instruction_prompt(self) -> str:
-        """The prompt to use when evaluating instruction-tuned models."""
-        prompt_config = self.task.template_dict[self.main_language]
-        instruction_prompt = (
-            prompt_config.default_instruction_prompt
-            if self._instruction_prompt is None
-            else self._instruction_prompt
-        )
-        return instruction_prompt
-
-    @property
-    def num_few_shot_examples(self) -> int:
-        """The number of few-shot examples to use."""
-        return (
-            self._num_few_shot_examples
-            if self._num_few_shot_examples is not None
-            else self.task.default_num_few_shot_examples
-        )
-
-    @property
-    def max_generated_tokens(self) -> int:
-        """The maximum number of tokens to generate when evaluating a model."""
-        return (
-            self._max_generated_tokens
-            if self._max_generated_tokens is not None
-            else self.task.default_max_generated_tokens
-        )
-
-    @property
-    def labels(self) -> c.Sequence[str]:
-        """The labels in the dataset."""
-        if self._labels is not None:
-            return self._labels
-        elif self.task.default_labels is not None:
-            return self.task.default_labels
-        else:
-            raise ValueError(
-                f"Labels must be specified for dataset {self.name!r} with the "
-                f"attribute `_labels`, as the task {self.task.name!r} does not have "
-                "default labels."
-            )
-
-    @property
-    def prompt_label_mapping(self) -> dict[str, str]:
-        """Mapping from English labels to localised labels."""
-        if self._prompt_label_mapping == "auto":
-            return {label: label for label in self.labels}
-        elif self._prompt_label_mapping is not None:
-            return self._prompt_label_mapping
-        prompt_config = self.task.template_dict[self.main_language]
-        if prompt_config.default_prompt_label_mapping == "auto":
-            return {label: label for label in self.labels}
-        else:
-            return prompt_config.default_prompt_label_mapping
-
-    @property
-    def allowed_model_types(self) -> c.Sequence[ModelType]:
-        """A list of model types that are allowed to be evaluated on this dataset."""
-        return (
-            self._allowed_model_types
-            if self._allowed_model_types is not None
-            else self.task.default_allowed_model_types
+            f"the {truncated_str}{languages_str}{task_str} dataset {dataset_name_str}"
         )
 
     @property
-    def allowed_generative_types(self) -> c.Sequence[GenerativeType]:
-        """A list of generative model types that are allowed on this dataset."""
-        return (
-            self._allowed_generative_types
-            if self._allowed_generative_types is not None
-            else self.task.default_allowed_generative_types
-        )
+    def main_language(self) -> Language:
+        """Get the main language of the dataset.
 
-    @property
-    def allow_invalid_model_outputs(self) -> bool:
-        """Whether to allow invalid model outputs."""
-        return (
-            self._allow_invalid_model_outputs
-            if self._allow_invalid_model_outputs is not None
-            else self.task.default_allow_invalid_model_outputs
-        )
+        Returns:
+            The main language.
+        """
+        match len(self.languages):
+            case 0:
+                raise InvalidBenchmark(
+                    f"Dataset {self.name!r} must have at least one language."
+                )
+            case 1:
+                return self.languages[0]
+            case _:
+                if ENGLISH in self.languages:
+                    return ENGLISH
+                elif NORWEGIAN in self.languages:
+                    return NORWEGIAN
+                elif PORTUGUESE in self.languages:
+                    return PORTUGUESE
+                else:
+                    return self.languages[0]
 
     @property
     def id2label(self) -> "HashableDict":
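Note: `name`, `pretty_name` and `source` are now lazy properties backed by private fields, so a config can be constructed without them (e.g. when the config resides directly in the Hugging Face dataset repo) and have them filled in later. A small behavioural sketch, using the same assumed constants as the previous example:

from scandeval.data_models import DatasetConfig
from scandeval.languages import DANISH  # assumed language constant
from scandeval.tasks import SENT  # hypothetical task constant

config = DatasetConfig(task=SENT, languages=[DANISH])
try:
    config.name  # not set yet, so the property raises
except ValueError as err:
    print(err)  # "The name of the dataset is not set!"

config.name = "filled-in-later"  # the setter stores the value
assert config.name == "filled-in-later"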
@@ -517,6 +689,9 @@ class BenchmarkConfig:
             faster evaluation, but at the risk of running out of GPU memory. Only reduce
             this if you are running out of GPU memory. Only relevant if the model is
             generative.
+        attention_backend:
+            The attention backend to use for vLLM. Defaults to FLASHINFER. Only
+            relevant if the model is generative.
         requires_safetensors:
             Whether to only allow models that use the safetensors format.
         generative_type:
@@ -553,6 +728,9 @@ class BenchmarkConfig:
     few_shot: bool
     num_iterations: int
     gpu_memory_utilization: float
+    attention_backend: t.Literal[
+        *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+    ]
     requires_safetensors: bool
     generative_type: GenerativeType | None
     download_only: bool
@@ -601,6 +779,9 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     requires_safetensors: bool
     download_only: bool
     gpu_memory_utilization: float
+    attention_backend: t.Literal[
+        *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+    ]
     generative_type: GenerativeType | None
     custom_datasets_file: Path
     force: bool
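Note: both `BenchmarkConfig` and `BenchmarkConfigParams` type the new `attention_backend` field by unpacking the `ATTENTION_BACKENDS` constant into `typing.Literal`, a pattern that requires Python 3.11+ and, as the inline comment shows, a type-checker suppression. A self-contained sketch of the pattern, with made-up backend names rather than the real `ATTENTION_BACKENDS` tuple:

import typing as t

# Stand-in for scandeval.constants.ATTENTION_BACKENDS; the real values may differ.
ATTENTION_BACKENDS = ("FLASHINFER", "FLASH_ATTN", "TORCH_SDPA")

# Star-unpacking inside a subscript (Python 3.11+) expands the tuple's members
# into the allowed Literal values.
AttentionBackend = t.Literal[*ATTENTION_BACKENDS]

def validate_backend(backend: str) -> str:
    # Runtime check mirroring what the static Literal type expresses.
    if backend not in ATTENTION_BACKENDS:
        raise ValueError(f"Unknown attention backend: {backend!r}")
    return backend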