EuroEval 16.0.1-py3-none-any.whl → 16.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (48)
  1. euroeval/benchmark_config_factory.py +6 -1
  2. euroeval/benchmark_modules/base.py +2 -0
  3. euroeval/benchmark_modules/fresh.py +7 -1
  4. euroeval/benchmark_modules/hf.py +26 -21
  5. euroeval/benchmark_modules/litellm.py +258 -131
  6. euroeval/benchmark_modules/vllm.py +79 -40
  7. euroeval/benchmarker.py +11 -2
  8. euroeval/cli.py +14 -1
  9. euroeval/constants.py +1 -1
  10. euroeval/data_models.py +77 -6
  11. euroeval/dataset_configs/__init__.py +1 -0
  12. euroeval/dataset_configs/danish.py +14 -0
  13. euroeval/dataset_configs/dutch.py +14 -0
  14. euroeval/dataset_configs/english.py +22 -0
  15. euroeval/dataset_configs/estonian.py +15 -7
  16. euroeval/dataset_configs/finnish.py +14 -0
  17. euroeval/dataset_configs/french.py +14 -0
  18. euroeval/dataset_configs/german.py +23 -0
  19. euroeval/dataset_configs/italian.py +14 -0
  20. euroeval/dataset_configs/latvian.py +14 -0
  21. euroeval/dataset_configs/norwegian.py +14 -0
  22. euroeval/dataset_configs/polish.py +126 -0
  23. euroeval/dataset_configs/portuguese.py +14 -0
  24. euroeval/dataset_configs/spanish.py +14 -0
  25. euroeval/dataset_configs/swedish.py +25 -0
  26. euroeval/enums.py +12 -0
  27. euroeval/generation.py +17 -8
  28. euroeval/generation_utils.py +65 -11
  29. euroeval/metrics/pipeline.py +1 -1
  30. euroeval/prompt_templates/linguistic_acceptability.py +9 -0
  31. euroeval/prompt_templates/multiple_choice.py +27 -1
  32. euroeval/prompt_templates/named_entity_recognition.py +20 -0
  33. euroeval/prompt_templates/reading_comprehension.py +11 -0
  34. euroeval/prompt_templates/sentiment_classification.py +15 -0
  35. euroeval/prompt_templates/summarization.py +27 -1
  36. euroeval/scores.py +5 -0
  37. euroeval/task_group_utils/question_answering.py +29 -29
  38. euroeval/task_group_utils/sequence_classification.py +11 -34
  39. euroeval/task_group_utils/token_classification.py +3 -3
  40. euroeval/tasks.py +4 -4
  41. euroeval/{tokenization_utils.py → tokenisation_utils.py} +50 -28
  42. euroeval/utils.py +36 -3
  43. {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/METADATA +1 -1
  44. euroeval-16.1.1.dist-info/RECORD +70 -0
  45. euroeval-16.0.1.dist-info/RECORD +0 -69
  46. {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/WHEEL +0 -0
  47. {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/entry_points.txt +0 -0
  48. {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py CHANGED
@@ -44,7 +44,11 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
-from ..generation_utils import apply_prompt, extract_few_shot_examples
+from ..generation_utils import (
+    apply_prompt,
+    extract_few_shot_examples,
+    raise_if_wrong_params,
+)
 from ..languages import get_all_languages
 from ..task_group_utils import (
     question_answering,
@@ -52,7 +56,7 @@ from ..task_group_utils import (
     text_to_text,
     token_classification,
 )
-from ..tokenization_utils import (
+from ..tokenisation_utils import (
     apply_chat_template,
     get_bos_token,
     get_end_of_chat_token_ids,
@@ -69,6 +73,7 @@ from ..utils import (
     get_hf_token,
     get_min_cuda_compute_capability,
     log_once,
+    split_model_id,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
 
@@ -97,6 +102,7 @@ class VLLMModel(HuggingFaceEncoderModel):
     fresh_model = False
     batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = True
+    allowed_params = {re.compile(r".*"): ["thinking", "no-thinking"]}
 
     def __init__(
         self,
@@ -120,42 +126,46 @@ class VLLMModel(HuggingFaceEncoderModel):
         if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")
 
+        raise_if_wrong_params(
+            model_config=model_config, allowed_params=self.allowed_params
+        )
+
         model, tokeniser = load_model_and_tokeniser(
             model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: "LLM" = model
         self._tokeniser: "PreTrainedTokenizer" = tokeniser
+
+        # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
+        # to call the `__init__` method of the `BenchmarkModule` class.
+        super(HuggingFaceEncoderModel, self).__init__(
+            model_config=model_config,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
+        )
+
         self.end_of_reasoning_token = get_end_of_reasoning_token(
             model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
         )
         self.end_of_chat_token_ids = get_end_of_chat_token_ids(
-            tokeniser=self._tokeniser
+            tokeniser=self._tokeniser, generative_type=self.generative_type
         )
         self.custom_stop_tokens = get_custom_stop_tokens(
             model=self._model,
             tokeniser=self._tokeniser,
             model_id=model_config.model_id,
-            is_reasoning_model=self.end_of_reasoning_token is not None,
-        )
-
-        # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
-        # to call the `__init__` method of the `BenchmarkModule` class.
-        super(HuggingFaceEncoderModel, self).__init__(
-            model_config=model_config,
-            dataset_config=dataset_config,
-            benchmark_config=benchmark_config,
-            log_metadata=log_metadata,
+            generative_type=self.generative_type,
         )
 
         self.buffer |= dict(
-            instruction_model=has_chat_template(tokeniser=self._tokeniser),
             first_label_token_mapping=get_first_label_token_mapping(
                 dataset_config=self.dataset_config,
                 model_config=self.model_config,
                 tokeniser=self._tokeniser,
                 generative_type=self.generative_type,
                 log_metadata=self.log_metadata,
-            ),
+            )
         )
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
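
Note: the body of the new `raise_if_wrong_params` helper (imported from `generation_utils` above) is not part of this diff. Below is a minimal sketch of the check it presumably performs, assuming the regex keys of `allowed_params` are matched against the model ID and that an unsupported `#param` suffix raises `InvalidModel`; the function name is hypothetical and marked as such.

import re

from euroeval.data_models import ModelConfig
from euroeval.exceptions import InvalidModel


def raise_if_wrong_params_sketch(
    model_config: ModelConfig, allowed_params: dict[re.Pattern, list[str]]
) -> None:
    """Illustrative check: reject a model `#param` that no pattern allows."""
    if model_config.param is None:
        return
    for pattern, params in allowed_params.items():
        # Assumption: each regex key is matched against the model ID.
        if pattern.search(model_config.model_id) and model_config.param in params:
            return
    raise InvalidModel(
        f"The parameter {model_config.param!r} is not supported for the model "
        f"{model_config.model_id!r}."
    )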
@@ -187,16 +197,36 @@ class VLLMModel(HuggingFaceEncoderModel):
             The generative type of the model, or None if it has not been set yet.
         """
         if not hasattr(self, "_tokeniser"):
+            log_once(
+                "The generative type of the model has not been set yet as the "
+                "tokeniser has not been loaded.",
+                level=logging.DEBUG,
+            )
             return None
-        elif self.end_of_reasoning_token is not None:
-            return GenerativeType.REASONING
+        elif self.benchmark_config.generative_type is not None:
+            type_ = self.benchmark_config.generative_type
+        elif self.model_config.param in {"thinking"}:
+            type_ = GenerativeType.REASONING
+        elif self.model_config.param in {"no-thinking"}:
+            type_ = GenerativeType.INSTRUCTION_TUNED
+        elif (
+            hasattr(self, "end_of_reasoning_token")
+            and self.end_of_reasoning_token is not None
+        ):
+            type_ = GenerativeType.REASONING
         elif (
             has_chat_template(tokeniser=self._tokeniser)
             or "instruct" in self.model_config.model_id.lower()
         ):
-            return GenerativeType.INSTRUCTION_TUNED
+            type_ = GenerativeType.INSTRUCTION_TUNED
         else:
-            return GenerativeType.BASE
+            type_ = GenerativeType.BASE
+        log_once(
+            f"Detected generative type {type_.name!r} for model "
+            f"{self.model_config.model_id!r}",
+            level=logging.DEBUG,
+        )
+        return type_
 
     @property
     def extract_labels_from_generation(self) -> ExtractLabelsFunction:
@@ -285,7 +315,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 few_shot_examples=few_shot_examples,
                 model_config=self.model_config,
                 dataset_config=self.dataset_config,
-                instruction_model=self.buffer["instruction_model"],
+                generative_type=self.generative_type,
                 always_populate_text_field=True,
                 tokeniser=self._tokeniser,
             ),
@@ -313,7 +343,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         """
         # Get stopping tokens
         stop_tokens: list[str] = self.custom_stop_tokens.copy()
-        if self.buffer["instruction_model"] is False:
+        if self.generative_type == GenerativeType.BASE:
             stop_tokens.append("\n\n")
         if self._tokeniser.pad_token_id is not None:
             assert isinstance(self._tokeniser.pad_token, str), (
@@ -430,9 +460,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         labels_to_be_generated = list(self.dataset_config.prompt_label_mapping.values())
         if len(labels_to_be_generated) == 0:
             labels_to_be_generated = ["negative", "positive"]
-        if not self.buffer.get(
-            "instruction_model", False
-        ) and should_prompts_be_stripped(
+        if self.generative_type == GenerativeType.BASE and should_prompts_be_stripped(
             labels_to_be_generated=labels_to_be_generated, tokeniser=self._tokeniser
         ):
             log_once(
@@ -590,9 +618,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         if using_api:
             return False
 
-        model_id, revision = (
-            model_id.split("@") if "@" in model_id else (model_id, "main")
-        )
+        model_id_components = split_model_id(model_id=model_id)
+        model_id = model_id_components.model_id
+        revision = model_id_components.revision
+
         model_info = get_model_repo_info(
             model_id=model_id, revision=revision, benchmark_config=benchmark_config
         )
@@ -616,11 +645,11 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The model configuration.
         """
-        model_id, revision = (
-            model_id.split("@") if "@" in model_id else (model_id, "main")
-        )
+        model_id_components = split_model_id(model_id=model_id)
         model_info = get_model_repo_info(
-            model_id=model_id, revision=revision, benchmark_config=benchmark_config
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            benchmark_config=benchmark_config,
         )
         if model_info is None:
             raise InvalidModel(f"The model {model_id!r} could not be found.")
@@ -629,8 +658,9 @@ class VLLMModel(HuggingFaceEncoderModel):
         language_codes = list(language_mapping.keys())
 
         model_config = ModelConfig(
-            model_id=model_id,
-            revision=revision,
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            param=model_id_components.param,
             task=model_info.pipeline_tag,
             languages=[
                 language_mapping[tag]
@@ -985,7 +1015,11 @@ def get_end_of_reasoning_token(
     prompt = "What is your name?"
     if has_chat_template(tokeniser=tokeniser):
         templated_prompt = apply_chat_template(
-            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
+            conversation=[dict(role="user", content=prompt)],
+            tokeniser=tokeniser,
+            tokenise=False,
+            add_generation_prompt=True,
+            enable_thinking=True,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
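
Note: the extra keyword arguments passed to `apply_chat_template` above come from `tokenisation_utils`, whose diff is not included in this section. Below is a minimal sketch of what such a wrapper could forward to the Hugging Face chat-template API, assuming it delegates to `PreTrainedTokenizer.apply_chat_template` and passes `enable_thinking` through as a template variable; the function name is hypothetical.

from transformers import PreTrainedTokenizer


def apply_chat_template_sketch(
    conversation: list[dict[str, str]],
    tokeniser: PreTrainedTokenizer,
    tokenise: bool = False,
    add_generation_prompt: bool = True,
    enable_thinking: bool = True,
) -> str | list[int]:
    """Illustrative wrapper around the Hugging Face chat-template API."""
    return tokeniser.apply_chat_template(
        conversation,
        tokenize=tokenise,
        add_generation_prompt=add_generation_prompt,
        # Extra kwargs are exposed to the Jinja template; reasoning-style templates
        # read `enable_thinking`, while other templates simply ignore it.
        enable_thinking=enable_thinking,
    )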
@@ -1063,7 +1097,7 @@ def get_custom_stop_tokens(
     model: "LLM",
     tokeniser: "PreTrainedTokenizer",
     model_id: str,
-    is_reasoning_model: bool,
+    generative_type: GenerativeType | None,
 ) -> list[str]:
     """Get the stop tokens for a generative model.
 
@@ -1074,9 +1108,8 @@
             The tokeniser.
         model_id:
             The model ID.
-        is_reasoning_model:
-            Whether the model is a reasoning model. This is used to determine the number
-            of generated tokens to allow before stopping the generation.
+        generative_type:
+            The generative type of the model.
 
     Returns:
         A list of stop tokens.
@@ -1086,12 +1119,18 @@
     prompt = "Hello"
     if has_chat_template(tokeniser=tokeniser):
         templated_prompt = apply_chat_template(
-            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
+            conversation=[dict(role="user", content=prompt)],
+            tokeniser=tokeniser,
+            tokenise=False,
+            add_generation_prompt=True,
+            enable_thinking=generative_type == GenerativeType.REASONING,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
 
-    max_tokens = REASONING_MAX_TOKENS if is_reasoning_model else 10
+    max_tokens = (
+        REASONING_MAX_TOKENS if generative_type == GenerativeType.REASONING else 10
+    )
     completion = (
         model.generate(
             prompts=[prompt],
euroeval/benchmarker.py CHANGED
@@ -19,7 +19,7 @@ from .constants import GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
-from .enums import Device, ModelType
+from .enums import Device, GenerativeType, ModelType
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
 from .generation import generate
@@ -79,6 +79,7 @@ class Benchmarker:
         api_base: str | None = None,
         api_version: str | None = None,
         gpu_memory_utilization: float = 0.9,
+        generative_type: GenerativeType | None = None,
         debug: bool = False,
         run_with_cli: bool = False,
         requires_safetensors: bool = False,
@@ -151,6 +152,10 @@ class Benchmarker:
                 is generative. A larger value will result in faster evaluation, but at
                 the risk of running out of GPU memory. Only reduce this if you are
                 running out of GPU memory. Defaults to 0.9.
+            generative_type:
+                The type of generative model to benchmark. Only relevant if the model is
+                generative. If not specified, then the type will be inferred based on
+                the tags of the model. Defaults to None.
             debug:
                 Whether to output debug information. Defaults to False.
             run_with_cli:
@@ -199,6 +204,7 @@ class Benchmarker:
             api_base=api_base,
             api_version=api_version,
             gpu_memory_utilization=gpu_memory_utilization,
+            generative_type=generative_type,
             debug=debug,
             run_with_cli=run_with_cli,
             requires_safetensors=requires_safetensors,
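
Note: the new `generative_type` argument can also be set programmatically. A minimal usage sketch, assuming `Benchmarker` is importable from the top-level `euroeval` package and that its `benchmark()` method accepts `model` and `dataset` keyword arguments; the model and dataset names below are placeholders.

from euroeval import Benchmarker
from euroeval.enums import GenerativeType

# Force the reasoning code path (REASONING_MAX_TOKENS budget, thinking-enabled
# chat templates) instead of relying on automatic detection.
benchmarker = Benchmarker(generative_type=GenerativeType.REASONING)
benchmarker.benchmark(model="some-org/some-model", dataset="angry-tweets")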
@@ -438,7 +444,7 @@ class Benchmarker:
 
         # Skip if the model type should not be benchmarked on this dataset
         model_type = model_config.model_type
-        allowed_model_types = dataset_config.task.allowed_model_types
+        allowed_model_types = dataset_config.allowed_model_types
         if model_type not in allowed_model_types:
             logger.debug(
                 f"Skipping benchmarking {model_id} on "
@@ -804,6 +810,7 @@ class Benchmarker:
             scores=scores,
             model_id=model_config.model_id,
             model_revision=model_config.revision,
+            model_param=model_config.param,
         )
 
         record = BenchmarkResult(
@@ -1108,6 +1115,8 @@ def initial_logging(
     model_id = model_config.model_id
     if model_config.revision and model_config.revision != "main":
         model_id += f"@{model_config.revision}"
+    if model_config.param is not None:
+        model_id += f"#{model_config.param}"
 
     split_type = "validation" if not benchmark_config.evaluate_test_split else "test"
     if model_config.task in GENERATIVE_PIPELINE_TAGS:
euroeval/cli.py CHANGED
@@ -4,7 +4,7 @@ import click
 
 from .benchmarker import Benchmarker
 from .dataset_configs import get_all_dataset_configs
-from .enums import Device
+from .enums import Device, GenerativeType
 from .languages import get_all_languages
 from .tasks import get_all_tasks
 
@@ -208,6 +208,14 @@ from .tasks import get_all_tasks
     help="Only allow loading models that have safetensors weights available",
     default=False,
 )
+@click.option(
+    "--generative-type",
+    type=click.Choice(["base", "instruction_tuned", "reasoning"]),
+    default=None,
+    show_default=True,
+    help="The type of generative model. Only relevant if the model is generative. If "
+    "not specified, the type will be inferred automatically.",
+)
 def benchmark(
     model: tuple[str],
     dataset: tuple[str],
@@ -234,6 +242,7 @@ def benchmark(
     gpu_memory_utilization: float,
     debug: bool,
     requires_safetensors: bool,
+    generative_type: str | None,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
     models = list(model)
@@ -244,6 +253,9 @@ def benchmark(
     tasks = None if len(task) == 0 else list(task)
     batch_size_int = int(batch_size)
     device = Device[device.upper()] if device is not None else None
+    generative_type_obj = (
+        GenerativeType[generative_type.upper()] if generative_type else None
+    )
 
     benchmarker = Benchmarker(
         language=languages,
@@ -268,6 +280,7 @@ def benchmark(
         api_base=api_base,
         api_version=api_version,
         gpu_memory_utilization=gpu_memory_utilization,
+        generative_type=generative_type_obj,
         debug=debug,
         run_with_cli=True,
         requires_safetensors=requires_safetensors,
euroeval/constants.py CHANGED
@@ -15,7 +15,7 @@ MAX_CONTEXT_LENGTH = 8_192
 
 # We need to raise the amount of tokens generated for reasoning models, to give them
 # time to think
-REASONING_MAX_TOKENS = 32_768
+REASONING_MAX_TOKENS = 8_192
 
 
 # The Hugging Face Hub pipeline tags used to classify models as generative
euroeval/data_models.py CHANGED
@@ -118,14 +118,14 @@ class Task:
             log probabilities for the generated tokens. Defaults to False.
         requires_logprobs (optional):
             Whether the task requires log probabilities. Implies `uses_logprobs`.
-        allowed_model_types (optional):
+        default_allowed_model_types (optional):
             A list of model types that are allowed to be evaluated on this task.
             Defaults to all model types being allowed.
-        allowed_generative_types (optional):
+        default_allowed_generative_types (optional):
             A list of generative model types that are allowed to be evaluated on this
             task. If None, all generative model types are allowed. Only relevant if
             `allowed_model_types` includes generative models.
-        allow_invalid_model_outputs (optional):
+        default_allow_invalid_model_outputs (optional):
             Whether to allow invalid model outputs. This is only relevant for generative
             models on classification tasks, where the model may generate an output
             which is not one of the allowed labels. If True, the model output will be
@@ -144,17 +144,17 @@ class Task:
     uses_structured_output: bool = False
     uses_logprobs: bool = False
     requires_logprobs: bool = False
-    allowed_model_types: list[ModelType] = field(
+    default_allowed_model_types: list[ModelType] = field(
         default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
     )
-    allowed_generative_types: list[GenerativeType] = field(
+    default_allowed_generative_types: list[GenerativeType] = field(
         default_factory=lambda: [
             GenerativeType.BASE,
             GenerativeType.INSTRUCTION_TUNED,
             GenerativeType.REASONING,
         ]
     )
-    allow_invalid_model_outputs: bool = True
+    default_allow_invalid_model_outputs: bool = True
 
     def __post_init__(self) -> None:
         """Post-initialisation checks."""
@@ -225,6 +225,9 @@ class BenchmarkConfig:
             Whether the benchmark is being run with the CLI.
         requires_safetensors:
             Whether to only allow models that use the safetensors format.
+        generative_type:
+            The type of generative model to benchmark. Only relevant if the model is
+            generative.
     """
 
     model_languages: list[Language]
@@ -251,6 +254,7 @@ class BenchmarkConfig:
     debug: bool
     run_with_cli: bool
     requires_safetensors: bool
+    generative_type: GenerativeType | None
 
 
 class BenchmarkConfigParams(pydantic.BaseModel):
@@ -280,6 +284,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     api_base: str | None
     api_version: str | None
     gpu_memory_utilization: float
+    generative_type: GenerativeType | None
     debug: bool
     run_with_cli: bool
     requires_safetensors: bool
@@ -402,6 +407,21 @@ class DatasetConfig:
             to a 1:1 mapping between the labels and themselves. If None then the mapping
             will be set to the default mapping for the task and language. Defaults to
             None.
+        _allowed_model_types (optional):
+            A list of model types that are allowed to be evaluated on this dataset.
+            Defaults to the one for the task.
+        _allowed_generative_types (optional):
+            A list of generative model types that are allowed to be evaluated on this
+            dataset. If None, all generative model types are allowed. Only relevant if
+            `allowed_model_types` includes generative models. Defaults to the one for
+            the task.
+        _allow_invalid_model_outputs (optional):
+            Whether to allow invalid model outputs. This is only relevant for
+            generative models on classification tasks, where the model may generate an
+            output which is not one of the allowed labels. If True, the model output
+            will be mapped to the closest valid label. If False, the model output will
+            be considered incorrect and the evaluation will be aborted. Defaults to
+            the one for the task.
         splits (optional):
             The names of the splits in the dataset. If not provided, defaults to
             ["train", "val", "test"].
@@ -423,6 +443,9 @@ class DatasetConfig:
     _max_generated_tokens: int | None = None
     _labels: list[str] | None = None
     _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
+    _allowed_model_types: list[ModelType] | None = None
+    _allowed_generative_types: list[GenerativeType] | None = None
+    _allow_invalid_model_outputs: bool | None = None
     splits: list[str] = field(default_factory=lambda: ["train", "val", "test"])
     bootstrap_samples: bool = True
     unofficial: bool = False
@@ -502,6 +525,33 @@ class DatasetConfig:
         else:
             return prompt_config.default_prompt_label_mapping
 
+    @property
+    def allowed_model_types(self) -> list[ModelType]:
+        """A list of model types that are allowed to be evaluated on this dataset."""
+        return (
+            self._allowed_model_types
+            if self._allowed_model_types is not None
+            else self.task.default_allowed_model_types
+        )
+
+    @property
+    def allowed_generative_types(self) -> list[GenerativeType]:
+        """A list of generative model types that are allowed on this dataset."""
+        return (
+            self._allowed_generative_types
+            if self._allowed_generative_types is not None
+            else self.task.default_allowed_generative_types
+        )
+
+    @property
+    def allow_invalid_model_outputs(self) -> bool:
+        """Whether to allow invalid model outputs."""
+        return (
+            self._allow_invalid_model_outputs
+            if self._allow_invalid_model_outputs is not None
+            else self.task.default_allow_invalid_model_outputs
+        )
+
     @property
     def id2label(self) -> dict[int, str]:
         """The mapping from ID to label."""
@@ -573,6 +623,8 @@ class ModelConfig:
             The ID of the model.
         revision:
             The revision of the model.
+        param:
+            The parameter of the model, or None if the model has no parameters.
         task:
             The task that the model was trained on.
         languages:
@@ -594,6 +646,7 @@ class ModelConfig:
 
     model_id: str
     revision: str
+    param: str | None
     task: str
     languages: list[Language]
     inference_backend: "InferenceBackend"
@@ -707,3 +760,21 @@ class PromptConfig:
     default_prompt_template: str
     default_instruction_prompt: str
     default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]
+
+
+@dataclass
+class ModelIdComponents:
+    """A model ID split into its components.
+
+    Attributes:
+        model_id:
+            The main model ID without revision or parameters.
+        revision:
+            The revision of the model, if any.
+        param:
+            The parameter of the model, if any.
+    """
+
+    model_id: str
+    revision: str
+    param: str | None
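
Note: the `split_model_id` helper that returns these components lives in `euroeval/utils.py`, whose body is not shown in this diff. Based on how `initial_logging` reassembles the ID (`@` for the revision, `#` for the parameter) and on the removed `(model_id, "main")` fallback, a minimal sketch of the parsing it presumably performs; the function name is hypothetical.

from euroeval.data_models import ModelIdComponents


def split_model_id_sketch(model_id: str) -> ModelIdComponents:
    """Illustrative parser for IDs of the form 'org/model@revision#param'."""
    param: str | None = None
    if "#" in model_id:
        model_id, param = model_id.split("#", maxsplit=1)
    revision = "main"  # the default revision used by the removed inline parsing
    if "@" in model_id:
        model_id, revision = model_id.split("@", maxsplit=1)
    return ModelIdComponents(model_id=model_id, revision=revision, param=param)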
euroeval/dataset_configs/__init__.py CHANGED
@@ -15,6 +15,7 @@ from .icelandic import *  # noqa: F403
 from .italian import *  # noqa: F403
 from .latvian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
+from .polish import *  # noqa: F403
 from .portuguese import *  # noqa: F403
 from .spanish import *  # noqa: F403
 from .swedish import *  # noqa: F403
euroeval/dataset_configs/danish.py CHANGED
@@ -1,6 +1,7 @@
 """All Danish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import DA
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -149,6 +150,19 @@ GOLDENSWAG_DA_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_DA_CONFIG = DatasetConfig(
+    name="winogrande-da",
+    pretty_name="the Danish common-sense reasoning dataset Winogrande-da, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-da",
+    task=COMMON_SENSE,
+    languages=[DA],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
     name="european-values-situational-da",
     pretty_name="the Danish version of the European values evaluation dataset, where "
euroeval/dataset_configs/dutch.py CHANGED
@@ -1,6 +1,7 @@
 """All Dutch dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import NL
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -142,6 +143,19 @@ GOLDENSWAG_NL_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_NL_CONFIG = DatasetConfig(
+    name="winogrande-nl",
+    pretty_name="the Dutch common-sense reasoning dataset Winogrande-nl, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-nl",
+    task=COMMON_SENSE,
+    languages=[NL],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_NL_CONFIG = DatasetConfig(
     name="european-values-situational-nl",
     pretty_name="the Dutch version of the European values evaluation dataset, where "
euroeval/dataset_configs/english.py CHANGED
@@ -1,6 +1,7 @@
 """All English dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import EN
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -80,6 +81,15 @@ EUROPEAN_VALUES_EN_CONFIG = DatasetConfig(
 
 ### Unofficial datasets ###
 
+XQUAD_EN_CONFIG = DatasetConfig(
+    name="xquad-en",
+    pretty_name="the English version of the reading comprehension dataset XQuAD",
+    huggingface_id="EuroEval/xquad-en",
+    task=RC,
+    languages=[EN],
+    unofficial=True,
+)
+
 ARC_CONFIG = DatasetConfig(
     name="arc",
     pretty_name="the truncated version of the English knowledge dataset ARC",
@@ -117,6 +127,18 @@ MULTI_WIKI_QA_EN_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_CONFIG = DatasetConfig(
+    name="winogrande",
+    pretty_name="the English common-sense reasoning dataset Winogrande",
+    huggingface_id="EuroEval/winogrande-en",
+    task=COMMON_SENSE,
+    languages=[EN],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_EN_CONFIG = DatasetConfig(
     name="european-values-situational-en",
     pretty_name="the English version of the European values evaluation dataset, where "