EuroEval 15.16.0__py3-none-any.whl → 16.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval was flagged as potentially problematic by the registry.
- euroeval/__init__.py +8 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +190 -110
- euroeval/benchmark_modules/vllm.py +199 -139
- euroeval/benchmarker.py +49 -22
- euroeval/cli.py +3 -3
- euroeval/constants.py +19 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +73 -23
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +35 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +6 -6
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +90 -20
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +276 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/model_cache.py +13 -1
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +19 -8
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +128 -42
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +59 -73
- euroeval/tasks.py +33 -6
- euroeval/tokenization_utils.py +294 -207
- euroeval/utils.py +150 -35
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
- euroeval-16.0.1.dist-info/RECORD +69 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -470
- euroeval-15.16.0.dist-info/RECORD +0 -63
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmarker.py
CHANGED

@@ -15,7 +15,7 @@ from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
 from torch.distributed import destroy_process_group

 from .benchmark_config_factory import build_benchmark_config
-from .constants import
+from .constants import GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
@@ -81,7 +81,7 @@ class Benchmarker:
         gpu_memory_utilization: float = 0.9,
         debug: bool = False,
         run_with_cli: bool = False,
-
+        requires_safetensors: bool = False,
     ) -> None:
         """Initialise the benchmarker.

@@ -156,7 +156,7 @@
             run_with_cli:
                 Whether the benchmarker is being run from the command-line interface.
                 Defaults to False.
-
+            requires_safetensors:
                 Whether to only allow models that use the safetensors format. Defaults
                 to False.

@@ -201,11 +201,11 @@
             gpu_memory_utilization=gpu_memory_utilization,
             debug=debug,
             run_with_cli=run_with_cli,
-
+            requires_safetensors=requires_safetensors,
         )

         self.benchmark_config = build_benchmark_config(
-
+            **self.benchmark_config_default_params.model_dump()
         )

         # Initialise variable storing model lists, so we only have to fetch it once
@@ -249,7 +249,7 @@
         evaluate_test_split: bool | None = None,
         few_shot: bool | None = None,
         num_iterations: int | None = None,
-
+        requires_safetensors: bool | None = None,
     ) -> list[BenchmarkResult]:
         """Benchmarks models on datasets.

@@ -327,7 +327,7 @@
                 to be used for power users, and scores will not be allowed on the
                 leaderboards if this is changed. Defaults to the value specified when
                 initialising the benchmarker.
-
+            requires_safetensors:
                 Whether to only allow models that use the safetensors format. Defaults
                 to the value specified when initialising the benchmarker.

@@ -361,7 +361,7 @@
             evaluate_test_split=evaluate_test_split,
             few_shot=few_shot,
             num_iterations=num_iterations,
-
+            requires_safetensors=requires_safetensors,
         )

         adjust_logging_level(verbose=benchmark_config.verbose)
@@ -390,7 +390,35 @@
                 continue

             loaded_model: BenchmarkModule | None = None
+            benchmark_params_to_revert: dict[str, t.Any] = dict()
             for dataset_config in dataset_configs:
+                # Revert any changes to the benchmark configuration made for the
+                # previous dataset
+                for param, value in benchmark_params_to_revert.items():
+                    setattr(benchmark_config, param, value)
+                benchmark_params_to_revert = dict()
+
+                # Update the benchmark config if the dataset requires it
+                if (
+                    "val" not in dataset_config.splits
+                    and not benchmark_config.evaluate_test_split
+                ):
+                    logger.debug(
+                        "The dataset does not have a validation split, so even though "
+                        "you requested evaluating the validation split (the default), "
+                        "we will evaluate on the test split."
+                    )
+                    benchmark_params_to_revert["evaluate_test_split"] = False
+                    benchmark_config.evaluate_test_split = True
+                if dataset_config.task.requires_zero_shot and benchmark_config.few_shot:
+                    logger.debug(
+                        "The task requires zero-shot evaluation, so even though you "
+                        "requested few-shot evaluation (the default), we will evaluate "
+                        "zero-shot."
+                    )
+                    benchmark_params_to_revert["few_shot"] = True
+                    benchmark_config.few_shot = False
+
                 # Skip if we have already benchmarked this model on this dataset and
                 # we are not forcing the benchmark
                 if not benchmark_config.force and model_has_been_benchmarked(
@@ -408,15 +436,14 @@
                     num_finished_benchmarks += 1
                     continue

-                # Skip if the model
-
-
-
-                if model_config.model_type == ModelType.ENCODER and task_is_generative:
+                # Skip if the model type should not be benchmarked on this dataset
+                model_type = model_config.model_type
+                allowed_model_types = dataset_config.task.allowed_model_types
+                if model_type not in allowed_model_types:
                     logger.debug(
                         f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it is
-                        "the
+                        f"{dataset_config.pretty_name}, as it is of type {model_type}, "
+                        f"and the only allowed model types are {allowed_model_types}."
                     )
                     continue

@@ -535,7 +562,7 @@
         api_version: str | None | None = None,
         debug: bool | None = None,
         run_with_cli: bool | None = None,
-
+        requires_safetensors: bool | None = None,
     ) -> "BenchmarkConfig":
         """Get an updated benchmark configuration.

@@ -609,7 +636,7 @@
             run_with_cli:
                 Whether the benchmarker is being run from the command-line interface.
                 If None, then this value will not be updated.
-
+            requires_safetensors:
                 Whether to only allow models that use the safetensors format. If None,
                 then this value will not be updated.

@@ -666,8 +693,8 @@
             benchmark_config_params.debug = debug
         if run_with_cli is not None:
             benchmark_config_params.run_with_cli = run_with_cli
-        if
-            benchmark_config_params.
+        if requires_safetensors is not None:
+            benchmark_config_params.requires_safetensors = requires_safetensors

         return build_benchmark_config(**benchmark_config_params.model_dump())

@@ -857,7 +884,7 @@
         evaluate_test_split: bool | None = None,
         few_shot: bool | None = None,
         num_iterations: int | None = None,
-
+        requires_safetensors: bool | None = None,
     ) -> list[BenchmarkResult]:
         """Benchmarks models on datasets.

@@ -935,7 +962,7 @@
                 to be used for power users, and scores will not be allowed on the
                 leaderboards if this is changed. Defaults to the value specified when
                 initialising the benchmarker.
-
+            requires_safetensors:
                 Whether to only allow models that use the safetensors format. Defaults
                 to the value specified when initialising the benchmarker.

@@ -971,7 +998,7 @@
             evaluate_test_split=evaluate_test_split,
             few_shot=few_shot,
             num_iterations=num_iterations,
-
+            requires_safetensors=requires_safetensors,
         )

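The user-facing change in this file is the new `requires_safetensors` flag, threaded from the constructor through `benchmark()` and into the benchmark configuration. A minimal usage sketch follows; the `model` keyword and the placeholder ID are assumptions based on the package's public API rather than something shown in this diff:

from euroeval import Benchmarker

# Only evaluate models that ship safetensors weights (new flag in 16.0.x).
benchmarker = Benchmarker(requires_safetensors=True)

# Assumed call shape: the `model` keyword and the ID below are placeholders,
# not values taken from this diff.
results = benchmarker.benchmark(model="<model-id>")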
euroeval/cli.py
CHANGED

@@ -203,7 +203,7 @@ from .tasks import get_all_tasks
     "relevant if the model is generative.",
 )
 @click.option(
-    "--
+    "--requires-safetensors",
     is_flag=True,
     help="Only allow loading models that have safetensors weights available",
     default=False,
@@ -233,7 +233,7 @@ def benchmark(
     api_version: str | None,
     gpu_memory_utilization: float,
     debug: bool,
-
+    requires_safetensors: bool,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
     models = list(model)
@@ -270,7 +270,7 @@ def benchmark(
         gpu_memory_utilization=gpu_memory_utilization,
         debug=debug,
         run_with_cli=True,
-
+        requires_safetensors=requires_safetensors,
     )

     # Perform the benchmark evaluation
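On the command line the same setting is exposed as the boolean `--requires-safetensors` flag (default False), so a run would look roughly like `euroeval --model <model-id> --requires-safetensors`, assuming the `euroeval` entry point declared in the wheel's entry_points.txt.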
euroeval/constants.py
CHANGED

@@ -1,7 +1,6 @@
 """Constants used throughout the project."""

 from .enums import TaskGroup
-from .tasks import NER

 # This is used as input to generative models; it cannot be a special token
 DUMMY_FILL_VALUE = 100
@@ -11,7 +10,7 @@ DUMMY_FILL_VALUE = 100
 # benchmark. We will still report the models' true maximum context length in the
 # metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
 # all tokens in the context.
-MAX_CONTEXT_LENGTH =
+MAX_CONTEXT_LENGTH = 8_192


 # We need to raise the amount of tokens generated for reasoning models, to give them
@@ -37,21 +36,10 @@ GENERATIVE_DATASET_TASK_GROUPS = [TaskGroup.TEXT_TO_TEXT]
 LOCAL_MODELS_REQUIRED_FILES = ["config.json"]


-# Tasks where we use structured generation for generative models
-TASKS_USING_JSON = [NER]
-
-
-# Tasks where we use log probabilities for generative models, rather than the raw
-# completion
-TASK_GROUPS_USING_LOGPROBS = [
-    TaskGroup.SEQUENCE_CLASSIFICATION,
-    TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
-]
-
-
 # The number of top log probabilities to return for generative models. For several APIs
 # this is the maximum number of log probabilities that can be returned
-
+MAX_VLLM_LOGPROBS = 20
+MAX_LITELLM_LOGPROBS = 8


 # We make sure to remove these metric attributes after each iteration, to avoid memory
@@ -77,3 +65,19 @@ REASONING_TOKENS = [
 # manually. We only use them as stop tokens if they actually appear in the model's
 # output
 CUSTOM_STOP_TOKENS = ["<sep>"]
+
+
+# For classification tasks we force LiteLLM models to output a JSON dictionary with a
+# single key and the values being restricted to the allowed labels. This is the key we
+# use
+LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
+
+
+# These characters are stripped from JSON output when trying to identify the label
+JSON_STRIP_CHARACTERS = ' {}\n\r":'
+
+
+# The number of tokens we generate when evaluating generative models on classification
+# tasks. We also use this to determine whether we should store logprobs in the model
+# outputs (and cache).
+NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
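The three new constants back the JSON-based classification flow for LiteLLM models. Below is a minimal, self-contained sketch of how they could be combined to recover a label from a JSON-style completion; it is illustrative only and is not the package's actual parsing code (that lives in the task_group_utils modules listed above):

# Values copied from the constants above.
LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
JSON_STRIP_CHARACTERS = ' {}\n\r":'

raw_output = '{"label": "positive"}\n'

# Drop everything up to and including the key, then strip the surrounding JSON
# punctuation from the value. Illustrative sketch only.
_, _, value = raw_output.partition(LITELLM_CLASSIFICATION_OUTPUT_KEY)
label = value.strip(JSON_STRIP_CHARACTERS)
print(label)  # positive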
euroeval/data_loading.py
CHANGED

@@ -12,6 +12,7 @@ from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator

 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
+from .tasks import EUROPEAN_VALUES
 from .utils import unscramble

 if t.TYPE_CHECKING:
@@ -48,40 +49,45 @@ def load_data(
         dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
     )

-    if not benchmark_config.evaluate_test_split:
+    if not benchmark_config.evaluate_test_split and "val" in dataset:
         dataset["test"] = dataset["val"]

     # Remove empty examples from the datasets
     for text_feature in ["tokens", "text"]:
-
-
+        for split in dataset_config.splits:
+            if text_feature in dataset[split].features:
+                dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)

-    # If we are testing then truncate the test set
-
+    # If we are testing then truncate the test set, unless we need the full set for
+    # evaluation
+    if hasattr(sys, "_called_from_test") and dataset_config.task != EUROPEAN_VALUES:
         dataset["test"] = dataset["test"].select(range(1))

-    # Bootstrap the splits
-
-
-
-
-
-
-
+    # Bootstrap the splits, if applicable
+    if dataset_config.bootstrap_samples:
+        bootstrapped_splits: dict[str, list["Dataset"]] = dict()
+        for split in dataset_config.splits:
+            bootstrap_indices = rng.integers(
+                0,
+                len(dataset[split]),
+                size=(benchmark_config.num_iterations, len(dataset[split])),
+            )
+            bootstrapped_splits[split] = [
+                dataset[split].select(bootstrap_indices[idx])
+                for idx in range(benchmark_config.num_iterations)
+            ]
+        datasets = [
+            DatasetDict(
+                {
+                    split: bootstrapped_splits[split][idx]
+                    for split in dataset_config.splits
+                }
+            )
             for idx in range(benchmark_config.num_iterations)
         ]
+    else:
+        datasets = [dataset] * benchmark_config.num_iterations

-    datasets = [
-        DatasetDict(
-            {
-                split: bootstrapped_splits[split][idx]
-                for split in ["train", "val", "test"]
-            }
-        )
-        for idx in range(benchmark_config.num_iterations)
-    ]
     return datasets


@@ -113,7 +119,7 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
             requests.ConnectionError,
             requests.ReadTimeout,
         ):
-            logger.
+            logger.debug(
                 f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
             )
             time.sleep(1)
@@ -126,11 +132,10 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
             f"{num_attempts} attempts."
         )
     assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
-
-    missing_keys = [key for key in required_keys if key not in dataset]
+    missing_keys = [key for key in dataset_config.splits if key not in dataset]
     if missing_keys:
         raise InvalidBenchmark(
             "The dataset is missing the following required splits: "
             f"{', '.join(missing_keys)}"
         )
-    return DatasetDict({key: dataset[key] for key in
+    return DatasetDict({key: dataset[key] for key in dataset_config.splits})
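The new bootstrapping branch draws, for each split, a matrix of resampled indices with shape (num_iterations, split length) and builds one DatasetDict per iteration; when `bootstrap_samples` is False, the same DatasetDict is simply reused for every iteration. A small stand-alone sketch of the index-drawing step (the sizes are hypothetical, not values from this diff):

import numpy as np

rng = np.random.default_rng(seed=4242)
num_iterations = 3  # hypothetical benchmark_config.num_iterations
split_size = 5      # hypothetical len(dataset[split])

# One row of indices, sampled with replacement, per bootstrap iteration.
bootstrap_indices = rng.integers(0, split_size, size=(num_iterations, split_size))
print(bootstrap_indices.shape)  # (3, 5)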
euroeval/data_models.py
CHANGED

@@ -9,11 +9,14 @@ from dataclasses import dataclass, field
 import pydantic
 import torch

-from .enums import Device,
-from .metrics import Metric
+from .enums import Device, GenerativeType, ModelType, TaskGroup
 from .types import ScoreDict
 from .utils import get_package_version

+if t.TYPE_CHECKING:
+    from .enums import InferenceBackend
+    from .metrics import Metric
+

 @dataclass
 class Language:
@@ -104,15 +107,58 @@ class Task:
             using few-shot evaluation.
         default_labels:
             The default labels for datasets using this task.
+        requires_zero_shot (optional):
+            Whether to only allow zero-shot evaluation for this task. If True, the
+            task will not be evaluated using few-shot examples.
+        uses_structured_output (optional):
+            Whether the task uses structured output. If True, the task will return
+            structured output (e.g., BIO tags for NER). Defaults to False.
+        uses_logprobs (optional):
+            Whether the task uses log probabilities. If True, the task will return
+            log probabilities for the generated tokens. Defaults to False.
+        requires_logprobs (optional):
+            Whether the task requires log probabilities. Implies `uses_logprobs`.
+        allowed_model_types (optional):
+            A list of model types that are allowed to be evaluated on this task.
+            Defaults to all model types being allowed.
+        allowed_generative_types (optional):
+            A list of generative model types that are allowed to be evaluated on this
+            task. If None, all generative model types are allowed. Only relevant if
+            `allowed_model_types` includes generative models.
+        allow_invalid_model_outputs (optional):
+            Whether to allow invalid model outputs. This is only relevant for generative
+            models on classification tasks, where the model may generate an output
+            which is not one of the allowed labels. If True, the model output will be
+            mapped to the closest valid label. If False, the model output will be
+            considered incorrect and the evaluation will be aborted. Defaults to True.
     """

     name: str
     task_group: TaskGroup
     template_dict: dict["Language", "PromptConfig"]
-    metrics: list[Metric]
+    metrics: list["Metric"]
     default_num_few_shot_examples: int
     default_max_generated_tokens: int
     default_labels: list[str]
+    requires_zero_shot: bool = False
+    uses_structured_output: bool = False
+    uses_logprobs: bool = False
+    requires_logprobs: bool = False
+    allowed_model_types: list[ModelType] = field(
+        default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
+    )
+    allowed_generative_types: list[GenerativeType] = field(
+        default_factory=lambda: [
+            GenerativeType.BASE,
+            GenerativeType.INSTRUCTION_TUNED,
+            GenerativeType.REASONING,
+        ]
+    )
+    allow_invalid_model_outputs: bool = True
+
+    def __post_init__(self) -> None:
+        """Post-initialisation checks."""
+        self.uses_logprobs = self.uses_logprobs or self.requires_logprobs

     def __hash__(self) -> int:
         """Return a hash of the task."""
@@ -177,7 +223,7 @@ class BenchmarkConfig:
            Whether to run the benchmark in debug mode.
        run_with_cli:
            Whether the benchmark is being run with the CLI.
-
+       requires_safetensors:
            Whether to only allow models that use the safetensors format.
    """

@@ -204,7 +250,7 @@
     gpu_memory_utilization: float
     debug: bool
     run_with_cli: bool
-
+    requires_safetensors: bool


 class BenchmarkConfigParams(pydantic.BaseModel):
@@ -236,7 +282,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     gpu_memory_utilization: float
     debug: bool
     run_with_cli: bool
-
+    requires_safetensors: bool


 class BenchmarkResult(pydantic.BaseModel):
@@ -356,6 +402,11 @@ class DatasetConfig:
            to a 1:1 mapping between the labels and themselves. If None then the mapping
            will be set to the default mapping for the task and language. Defaults to
            None.
+       splits (optional):
+           The names of the splits in the dataset. If not provided, defaults to
+           ["train", "val", "test"].
+       bootstrap_samples (optional):
+           Whether to bootstrap the dataset samples. Defaults to True.
        unofficial (optional):
            Whether the dataset is unofficial. Defaults to False.
    """
@@ -372,6 +423,8 @@
     _max_generated_tokens: int | None = None
     _labels: list[str] | None = None
     _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
+    splits: list[str] = field(default_factory=lambda: ["train", "val", "test"])
+    bootstrap_samples: bool = True
     unofficial: bool = False

     @property
@@ -384,7 +437,6 @@
             if self._prompt_prefix is None
             else self._prompt_prefix
         )
-        prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
         return prompt_prefix

     @property
@@ -397,7 +449,6 @@
             if self._prompt_template is None
             else self._prompt_template
         )
-        prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
         return prompt_template

     @property
@@ -410,9 +461,6 @@
             if self._instruction_prompt is None
             else self._instruction_prompt
         )
-        instruction_prompt = instruction_prompt.replace(
-            "{labels_str}", self._labels_str
-        )
         return instruction_prompt

     @property
@@ -473,15 +521,16 @@
         """Return a hash of the dataset configuration."""
         return hash(self.name)

-
-    def _labels_str(self) -> str:
+    def get_labels_str(self, labels: list[str] | None = None) -> str:
         """Converts a set of labels to a natural string, in the specified language.

         If the task is NER, we separate using 'and' and use the mapped labels instead of
         the BIO NER labels.

         Args:
-
+            labels (optional):
+                The labels to convert to a natural string. If None, uses all the labels
+                in the dataset. Defaults to None.

         Returns:
             The natural string representation of the labels in specified language.
@@ -493,16 +542,17 @@
         else:
             sep_word = main_language.or_separator

-
-
-
-
-
-
-
+        if labels is None:
+            labels = list()
+            for english_label in self.labels:
+                if english_label not in self.prompt_label_mapping:
+                    continue
+                label = self.prompt_label_mapping[english_label]
+                if label not in labels:
+                    labels.append(label)

         # Convert labels to single-quoted labels - and remove duplicates
-        quoted_labels = [f"'{label}'" for label in
+        quoted_labels = [f"'{label}'" for label in labels]

         if not quoted_labels:
             return ""
@@ -546,7 +596,7 @@ class ModelConfig:
     revision: str
     task: str
     languages: list[Language]
-    inference_backend: InferenceBackend
+    inference_backend: "InferenceBackend"
     merge: bool
     model_type: ModelType
     fresh: bool
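A subtle point in the new `Task` fields is the `__post_init__` coupling: setting `requires_logprobs=True` also switches on `uses_logprobs`. A stand-alone sketch of that pattern (a toy class, not the real `Task`, which has many more required fields):

from dataclasses import dataclass


@dataclass
class TaskFlags:
    """Toy stand-in for the logprob-related flags on Task."""

    uses_logprobs: bool = False
    requires_logprobs: bool = False

    def __post_init__(self) -> None:
        # Requiring logprobs implies using them, mirroring Task.__post_init__.
        self.uses_logprobs = self.uses_logprobs or self.requires_logprobs


assert TaskFlags(requires_logprobs=True).uses_logprobs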
euroeval/dataset_configs/__init__.py
CHANGED

@@ -6,12 +6,14 @@ from ..tasks import SPEED
 from .danish import *  # noqa: F403
 from .dutch import *  # noqa: F403
 from .english import *  # noqa: F403
+from .estonian import *  # noqa: F403
 from .faroese import *  # noqa: F403
 from .finnish import *  # noqa: F403
 from .french import *  # noqa: F403
 from .german import *  # noqa: F403
 from .icelandic import *  # noqa: F403
 from .italian import *  # noqa: F403
+from .latvian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
 from .portuguese import *  # noqa: F403
 from .spanish import *  # noqa: F403
euroeval/dataset_configs/danish.py
CHANGED

@@ -2,7 +2,7 @@

 from ..data_models import DatasetConfig
 from ..languages import DA
-from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

 ### Official datasets ###

@@ -76,6 +76,16 @@ HELLASWAG_DA_CONFIG = DatasetConfig(
     languages=[DA],
 )

+EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
+    name="european-values-da",
+    pretty_name="the Danish version of the European values evaluation dataset",
+    huggingface_id="EuroEval/european-values-da",
+    task=EUROPEAN_VALUES,
+    languages=[DA],
+    splits=["test"],
+    bootstrap_samples=False,
+)
+

 ### Unofficial datasets ###

@@ -138,3 +148,27 @@ GOLDENSWAG_DA_CONFIG = DatasetConfig(
     languages=[DA],
     unofficial=True,
 )
+
+EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
+    name="european-values-situational-da",
+    pretty_name="the Danish version of the European values evaluation dataset, where "
+    "the questions are phrased in a situational way",
+    huggingface_id="EuroEval/european-values-situational-da",
+    task=EUROPEAN_VALUES,
+    languages=[DA],
+    splits=["test"],
+    bootstrap_samples=False,
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_COMPLETIONS_DA_CONFIG = DatasetConfig(
+    name="european-values-completions-da",
+    pretty_name="the Danish version of the European values evaluation dataset, where "
+    "the questions are phrased as sentence completions",
+    huggingface_id="EuroEval/european-values-completions-da",
+    task=EUROPEAN_VALUES,
+    languages=[DA],
+    splits=["test"],
+    bootstrap_samples=False,
+    unofficial=True,
+)