EuroEval: euroeval-16.0.0-py3-none-any.whl → euroeval-16.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
The registry has flagged this release of EuroEval as potentially problematic.
- euroeval/__init__.py +5 -0
- euroeval/benchmark_config_factory.py +6 -1
- euroeval/benchmark_modules/base.py +2 -0
- euroeval/benchmark_modules/fresh.py +7 -1
- euroeval/benchmark_modules/hf.py +26 -21
- euroeval/benchmark_modules/litellm.py +258 -131
- euroeval/benchmark_modules/vllm.py +120 -68
- euroeval/benchmarker.py +11 -2
- euroeval/cli.py +14 -1
- euroeval/constants.py +7 -1
- euroeval/data_models.py +95 -20
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +14 -3
- euroeval/dataset_configs/dutch.py +14 -0
- euroeval/dataset_configs/english.py +22 -0
- euroeval/dataset_configs/estonian.py +15 -7
- euroeval/dataset_configs/finnish.py +14 -0
- euroeval/dataset_configs/french.py +14 -0
- euroeval/dataset_configs/german.py +23 -0
- euroeval/dataset_configs/italian.py +14 -0
- euroeval/dataset_configs/latvian.py +14 -0
- euroeval/dataset_configs/norwegian.py +14 -0
- euroeval/dataset_configs/polish.py +126 -0
- euroeval/dataset_configs/portuguese.py +14 -0
- euroeval/dataset_configs/spanish.py +14 -0
- euroeval/dataset_configs/swedish.py +25 -0
- euroeval/enums.py +12 -0
- euroeval/generation.py +17 -8
- euroeval/generation_utils.py +102 -16
- euroeval/metrics/pipeline.py +51 -9
- euroeval/model_cache.py +13 -1
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +27 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +11 -0
- euroeval/prompt_templates/sentiment_classification.py +15 -0
- euroeval/prompt_templates/summarization.py +27 -1
- euroeval/scores.py +5 -0
- euroeval/task_group_utils/multiple_choice_classification.py +2 -2
- euroeval/task_group_utils/question_answering.py +29 -29
- euroeval/task_group_utils/sequence_classification.py +71 -81
- euroeval/task_group_utils/token_classification.py +17 -3
- euroeval/tasks.py +12 -10
- euroeval/{tokenization_utils.py → tokenisation_utils.py} +41 -25
- euroeval/utils.py +67 -3
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/METADATA +3 -1
- euroeval-16.1.0.dist-info/RECORD +70 -0
- euroeval-16.0.0.dist-info/RECORD +0 -69
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/WHEEL +0 -0
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/licenses/LICENSE +0 -0
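A diff like the one below can be reproduced locally with nothing but the standard library. A minimal sketch, assuming both wheels have already been downloaded (e.g. via `pip download euroeval==16.0.0 --no-deps` and the same for 16.1.0):

    import difflib
    import zipfile

    def wheel_sources(path: str) -> dict[str, list[str]]:
        """Map each Python file inside a wheel to its lines of text."""
        with zipfile.ZipFile(path) as wheel:
            return {
                name: wheel.read(name).decode("utf-8", errors="replace").splitlines()
                for name in wheel.namelist()
                if name.endswith(".py")
            }

    old = wheel_sources("euroeval-16.0.0-py3-none-any.whl")
    new = wheel_sources("euroeval-16.1.0-py3-none-any.whl")
    for name in sorted(set(old) | set(new)):
        # Files present on only one side are diffed against an empty file.
        for line in difflib.unified_diff(
            old.get(name, []), new.get(name, []), fromfile=name, tofile=name, lineterm=""
        ):
            print(line)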
euroeval/data_models.py
CHANGED
@@ -118,13 +118,19 @@ class Task:
             log probabilities for the generated tokens. Defaults to False.
         requires_logprobs (optional):
             Whether the task requires log probabilities. Implies `uses_logprobs`.
-        allowed_model_types (optional):
+        default_allowed_model_types (optional):
             A list of model types that are allowed to be evaluated on this task.
             Defaults to all model types being allowed.
-        allowed_generative_types (optional):
+        default_allowed_generative_types (optional):
             A list of generative model types that are allowed to be evaluated on this
             task. If None, all generative model types are allowed. Only relevant if
             `allowed_model_types` includes generative models.
+        default_allow_invalid_model_outputs (optional):
+            Whether to allow invalid model outputs. This is only relevant for generative
+            models on classification tasks, where the model may generate an output
+            which is not one of the allowed labels. If True, the model output will be
+            mapped to the closest valid label. If False, the model output will be
+            considered incorrect and the evaluation will be aborted. Defaults to True.
     """

     name: str

@@ -138,16 +144,17 @@ class Task:
     uses_structured_output: bool = False
     uses_logprobs: bool = False
     requires_logprobs: bool = False
-    allowed_model_types: list[ModelType] = field(
+    default_allowed_model_types: list[ModelType] = field(
         default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
     )
-    allowed_generative_types: list[GenerativeType] = field(
+    default_allowed_generative_types: list[GenerativeType] = field(
         default_factory=lambda: [
             GenerativeType.BASE,
             GenerativeType.INSTRUCTION_TUNED,
             GenerativeType.REASONING,
         ]
     )
+    default_allow_invalid_model_outputs: bool = True

     def __post_init__(self) -> None:
         """Post-initialisation checks."""
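The new `default_allow_invalid_model_outputs` flag documents a closest-valid-label fallback for generative models on classification tasks. A minimal sketch of that behaviour using stdlib fuzzy matching; the diff does not show EuroEval's actual matching logic, so treat this as an illustration only:

    import difflib

    def resolve_output(output: str, labels: list[str], allow_invalid: bool) -> str:
        """Map a generated output onto the allowed label set."""
        if output in labels:
            return output
        if not allow_invalid:
            # Mirrors the documented behaviour: invalid outputs abort the evaluation.
            raise ValueError(f"Invalid model output: {output!r}")
        # Otherwise map to the closest valid label.
        return difflib.get_close_matches(output, labels, n=1, cutoff=0.0)[0]

    print(resolve_output("positiv", ["positive", "negative", "neutral"], True))
    # -> 'positive'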
@@ -218,6 +225,9 @@ class BenchmarkConfig:
             Whether the benchmark is being run with the CLI.
         requires_safetensors:
             Whether to only allow models that use the safetensors format.
+        generative_type:
+            The type of generative model to benchmark. Only relevant if the model is
+            generative.
     """

     model_languages: list[Language]

@@ -244,6 +254,7 @@ class BenchmarkConfig:
     debug: bool
     run_with_cli: bool
     requires_safetensors: bool
+    generative_type: GenerativeType | None


 class BenchmarkConfigParams(pydantic.BaseModel):

@@ -273,6 +284,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     api_base: str | None
     api_version: str | None
     gpu_memory_utilization: float
+    generative_type: GenerativeType | None
     debug: bool
     run_with_cli: bool
     requires_safetensors: bool
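`generative_type` now flows through both `BenchmarkConfig` and the pydantic `BenchmarkConfigParams`, so it should be settable wherever the other benchmark parameters are (the diff also touches `cli.py`, suggesting a matching CLI flag). A hedged sketch: the keyword comes from the fields above, but the `Benchmarker` constructor accepting it as a plain string is an assumption, not confirmed by this diff:

    from euroeval import Benchmarker

    # Pin the generative type (base / instruction-tuned / reasoning) instead of
    # letting EuroEval infer it from the model. Keyword and value format are
    # assumed from the new BenchmarkConfigParams field.
    benchmarker = Benchmarker(generative_type="reasoning")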
@@ -395,6 +407,21 @@ class DatasetConfig:
             to a 1:1 mapping between the labels and themselves. If None then the mapping
             will be set to the default mapping for the task and language. Defaults to
             None.
+        _allowed_model_types (optional):
+            A list of model types that are allowed to be evaluated on this dataset.
+            Defaults to the one for the task.
+        _allowed_generative_types (optional):
+            A list of generative model types that are allowed to be evaluated on this
+            dataset. If None, all generative model types are allowed. Only relevant if
+            `allowed_model_types` includes generative models. Defaults to the one for
+            the task.
+        _allow_invalid_model_outputs (optional):
+            Whether to allow invalid model outputs. This is only relevant for
+            generative models on classification tasks, where the model may generate an
+            output which is not one of the allowed labels. If True, the model output
+            will be mapped to the closest valid label. If False, the model output will
+            be considered incorrect and the evaluation will be aborted. Defaults to
+            the one for the task.
         splits (optional):
             The names of the splits in the dataset. If not provided, defaults to
             ["train", "val", "test"].
@@ -416,6 +443,9 @@ class DatasetConfig:
     _max_generated_tokens: int | None = None
     _labels: list[str] | None = None
     _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
+    _allowed_model_types: list[ModelType] | None = None
+    _allowed_generative_types: list[GenerativeType] | None = None
+    _allow_invalid_model_outputs: bool | None = None
     splits: list[str] = field(default_factory=lambda: ["train", "val", "test"])
     bootstrap_samples: bool = True
     unofficial: bool = False

@@ -430,7 +460,6 @@ class DatasetConfig:
             if self._prompt_prefix is None
             else self._prompt_prefix
         )
-        prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
         return prompt_prefix

     @property

@@ -443,7 +472,6 @@ class DatasetConfig:
             if self._prompt_template is None
             else self._prompt_template
         )
-        prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
         return prompt_template

     @property

@@ -456,9 +484,6 @@ class DatasetConfig:
             if self._instruction_prompt is None
             else self._instruction_prompt
         )
-        instruction_prompt = instruction_prompt.replace(
-            "{labels_str}", self._labels_str
-        )
         return instruction_prompt

     @property
@@ -500,6 +525,33 @@ class DatasetConfig:
         else:
             return prompt_config.default_prompt_label_mapping

+    @property
+    def allowed_model_types(self) -> list[ModelType]:
+        """A list of model types that are allowed to be evaluated on this dataset."""
+        return (
+            self._allowed_model_types
+            if self._allowed_model_types is not None
+            else self.task.default_allowed_model_types
+        )
+
+    @property
+    def allowed_generative_types(self) -> list[GenerativeType]:
+        """A list of generative model types that are allowed on this dataset."""
+        return (
+            self._allowed_generative_types
+            if self._allowed_generative_types is not None
+            else self.task.default_allowed_generative_types
+        )
+
+    @property
+    def allow_invalid_model_outputs(self) -> bool:
+        """Whether to allow invalid model outputs."""
+        return (
+            self._allow_invalid_model_outputs
+            if self._allow_invalid_model_outputs is not None
+            else self.task.default_allow_invalid_model_outputs
+        )
+
     @property
     def id2label(self) -> dict[int, str]:
         """The mapping from ID to label."""
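All three properties implement one pattern: a dataset-level `_`-prefixed override wins, and otherwise the value falls back to the task-level `default_*` field. A self-contained sketch of that pattern with simplified types (plain strings instead of EuroEval's enums):

    from dataclasses import dataclass, field

    @dataclass
    class Task:
        default_allowed_model_types: list[str] = field(
            default_factory=lambda: ["encoder", "generative"]
        )

    @dataclass
    class DatasetConfig:
        task: Task
        _allowed_model_types: list[str] | None = None

        @property
        def allowed_model_types(self) -> list[str]:
            # Dataset override wins; otherwise fall back to the task default.
            if self._allowed_model_types is not None:
                return self._allowed_model_types
            return self.task.default_allowed_model_types

    # A dataset restricted to generative models, as the Winogrande configs below are.
    config = DatasetConfig(task=Task(), _allowed_model_types=["generative"])
    assert config.allowed_model_types == ["generative"]
    assert DatasetConfig(task=Task()).allowed_model_types == ["encoder", "generative"]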
@@ -519,15 +571,16 @@ class DatasetConfig:
         """Return a hash of the dataset configuration."""
         return hash(self.name)

-    @property
-    def _labels_str(self) -> str:
+    def get_labels_str(self, labels: list[str] | None = None) -> str:
         """Converts a set of labels to a natural string, in the specified language.

         If the task is NER, we separate using 'and' and use the mapped labels instead of
         the BIO NER labels.

         Args:
-
+            labels (optional):
+                The labels to convert to a natural string. If None, uses all the labels
+                in the dataset. Defaults to None.

         Returns:
             The natural string representation of the labels in specified language.

@@ -539,16 +592,17 @@ class DatasetConfig:
         else:
             sep_word = main_language.or_separator

-
-
-
-
-
-
-
+        if labels is None:
+            labels = list()
+            for english_label in self.labels:
+                if english_label not in self.prompt_label_mapping:
+                    continue
+                label = self.prompt_label_mapping[english_label]
+                if label not in labels:
+                    labels.append(label)

         # Convert labels to single-quoted labels - and remove duplicates
-        quoted_labels = [f"'{label}'" for label in
+        quoted_labels = [f"'{label}'" for label in labels]

         if not quoted_labels:
             return ""
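`get_labels_str` joins single-quoted labels with a language-specific separator word (`or_separator` in the context above, 'and' for NER). A self-contained sketch of that formatting, with the separator hard-coded to English 'or' where EuroEval reads it from the language object:

    def labels_str(labels: list[str], sep_word: str = "or") -> str:
        """Render labels as a natural string: ['a', 'b'] -> "'a' or 'b'"."""
        quoted = [f"'{label}'" for label in labels]
        if not quoted:
            return ""
        if len(quoted) == 1:
            return quoted[0]
        return ", ".join(quoted[:-1]) + f" {sep_word} " + quoted[-1]

    assert labels_str(["a", "b"]) == "'a' or 'b'"
    assert labels_str(["a", "b", "c"]) == "'a', 'b' or 'c'"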
@@ -569,6 +623,8 @@ class ModelConfig:
             The ID of the model.
         revision:
             The revision of the model.
+        param:
+            The parameter of the model, or None if the model has no parameters.
         task:
             The task that the model was trained on.
         languages:

@@ -590,6 +646,7 @@ class ModelConfig:

     model_id: str
     revision: str
+    param: str | None
     task: str
     languages: list[Language]
     inference_backend: "InferenceBackend"
@@ -703,3 +760,21 @@ class PromptConfig:
     default_prompt_template: str
     default_instruction_prompt: str
     default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]
+
+
+@dataclass
+class ModelIdComponents:
+    """A model ID split into its components.
+
+    Attributes:
+        model_id:
+            The main model ID without revision or parameters.
+        revision:
+            The revision of the model, if any.
+        param:
+            The parameter of the model, if any.
+    """
+
+    model_id: str
+    revision: str
+    param: str | None
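The new `ModelIdComponents` dataclass pairs with the `param` field added to `ModelConfig`, suggesting that model IDs can now carry a parameter alongside a revision. The parser itself is not shown in this diff, so the separators below ('@' for the revision, '#' for the parameter) and the 'main' default are assumptions purely for illustration:

    from euroeval.data_models import ModelIdComponents  # dataclass added above

    def split_model_id(raw: str) -> ModelIdComponents:
        """Hypothetical splitter; EuroEval's real syntax may differ."""
        param: str | None = None
        revision = "main"
        if "#" in raw:  # assumed parameter separator
            raw, param = raw.rsplit("#", 1)
        if "@" in raw:  # assumed revision separator
            raw, revision = raw.rsplit("@", 1)
        return ModelIdComponents(model_id=raw, revision=revision, param=param)

    print(split_model_id("org/model@main#8b"))
    # -> ModelIdComponents(model_id='org/model', revision='main', param='8b')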
euroeval/dataset_configs/__init__.py
CHANGED

@@ -15,6 +15,7 @@ from .icelandic import *  # noqa: F403
 from .italian import *  # noqa: F403
 from .latvian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
+from .polish import *  # noqa: F403
 from .portuguese import *  # noqa: F403
 from .spanish import *  # noqa: F403
 from .swedish import *  # noqa: F403
euroeval/dataset_configs/danish.py
CHANGED

@@ -1,6 +1,7 @@
 """All Danish dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import DA
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -84,7 +85,6 @@ EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
 )


@@ -150,6 +150,19 @@ GOLDENSWAG_DA_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_DA_CONFIG = DatasetConfig(
+    name="winogrande-da",
+    pretty_name="the Danish common-sense reasoning dataset Winogrande-da, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-da",
+    task=COMMON_SENSE,
+    languages=[DA],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
     name="european-values-situational-da",
     pretty_name="the Danish version of the European values evaluation dataset, where "

@@ -159,7 +172,6 @@ EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
     unofficial=True,
 )

@@ -172,6 +184,5 @@ EUROPEAN_VALUES_COMPLETIONS_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
     unofficial=True,
 )
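Every Winogrande config added in this release follows the Danish one above: a binary 'a'/'b' choice restricted to generative models via `_allowed_model_types`. Running one should then work like any other EuroEval dataset; a sketch using EuroEval's documented Python API (the model ID is a placeholder):

    from euroeval import Benchmarker

    benchmarker = Benchmarker()
    benchmarker.benchmark(model="<huggingface-model-id>", dataset="winogrande-da")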
euroeval/dataset_configs/dutch.py
CHANGED

@@ -1,6 +1,7 @@
 """All Dutch dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import NL
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -142,6 +143,19 @@ GOLDENSWAG_NL_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_NL_CONFIG = DatasetConfig(
+    name="winogrande-nl",
+    pretty_name="the Dutch common-sense reasoning dataset Winogrande-nl, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-nl",
+    task=COMMON_SENSE,
+    languages=[NL],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_NL_CONFIG = DatasetConfig(
     name="european-values-situational-nl",
     pretty_name="the Dutch version of the European values evaluation dataset, where "
euroeval/dataset_configs/english.py
CHANGED

@@ -1,6 +1,7 @@
 """All English dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import EN
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -80,6 +81,15 @@ EUROPEAN_VALUES_EN_CONFIG = DatasetConfig(

 ### Unofficial datasets ###

+XQUAD_EN_CONFIG = DatasetConfig(
+    name="xquad-en",
+    pretty_name="the English version of the reading comprehension dataset XQuAD",
+    huggingface_id="EuroEval/xquad-en",
+    task=RC,
+    languages=[EN],
+    unofficial=True,
+)
+
 ARC_CONFIG = DatasetConfig(
     name="arc",
     pretty_name="the truncated version of the English knowledge dataset ARC",

@@ -117,6 +127,18 @@ MULTI_WIKI_QA_EN_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_CONFIG = DatasetConfig(
+    name="winogrande",
+    pretty_name="the English common-sense reasoning dataset Winogrande",
+    huggingface_id="EuroEval/winogrande-en",
+    task=COMMON_SENSE,
+    languages=[EN],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_EN_CONFIG = DatasetConfig(
     name="european-values-situational-en",
     pretty_name="the English version of the European values evaluation dataset, where "
euroeval/dataset_configs/estonian.py
CHANGED

@@ -47,13 +47,12 @@ ERR_NEWS_CONFIG = DatasetConfig(
     languages=[ET],
 )

-
-    name="
-    pretty_name="the Estonian knowledge
-    huggingface_id="EuroEval/
+TRIVIA_ET_CONFIG = DatasetConfig(
+    name="trivia-et",
+    pretty_name="the Estonian knowledge dataset Trivia-et",
+    huggingface_id="EuroEval/trivia-et",
     task=KNOW,
     languages=[ET],
-    _labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"],
 )

 WINOGRANDE_ET_CONFIG = DatasetConfig(

@@ -82,8 +81,7 @@ EUROPEAN_VALUES_ET_CONFIG = DatasetConfig(
     _instruction_prompt="{text}",
 )

-
-### Unofficial datasets ###
+### Unofficial datasets ###

 SCALA_ET_CONFIG = DatasetConfig(
     name="scala-et",

@@ -93,3 +91,13 @@ SCALA_ET_CONFIG = DatasetConfig(
     languages=[ET],
     unofficial=True,
 )
+
+EXAM_ET_CONFIG = DatasetConfig(
+    name="exam-et",
+    pretty_name="the Estonian knowledge assessment dataset Exam-et",
+    huggingface_id="EuroEval/exam-et",
+    task=KNOW,
+    languages=[ET],
+    _labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"],
+    unofficial=True,
+)
euroeval/dataset_configs/finnish.py
CHANGED

@@ -1,6 +1,7 @@
 """All Finnish dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import FI
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, LA, MCRC, NER, RC, SENT, SUMM

@@ -101,6 +102,19 @@ GOLDENSWAG_FI_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_FI_CONFIG = DatasetConfig(
+    name="winogrande-fi",
+    pretty_name="the Finnish common-sense reasoning dataset Winogrande-fi, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-fi",
+    task=COMMON_SENSE,
+    languages=[FI],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_FI_CONFIG = DatasetConfig(
     name="european-values-situational-fi",
     pretty_name="the Finnish version of the European values evaluation dataset, where "
euroeval/dataset_configs/french.py
CHANGED

@@ -1,6 +1,7 @@
 """All French dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import FR
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -113,6 +114,19 @@ GOLDENSWAG_FR_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_FR_CONFIG = DatasetConfig(
+    name="winogrande-fr",
+    pretty_name="the French common-sense reasoning dataset Winogrande-fr, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-fr",
+    task=COMMON_SENSE,
+    languages=[FR],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_FR_CONFIG = DatasetConfig(
     name="european-values-situational-fr",
     pretty_name="the French version of the European values evaluation dataset, where "
euroeval/dataset_configs/german.py
CHANGED

@@ -1,6 +1,7 @@
 """All German dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import DE
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -81,6 +82,15 @@ EUROPEAN_VALUES_DE_CONFIG = DatasetConfig(

 ### Unofficial datasets ###

+XQUAD_DE_CONFIG = DatasetConfig(
+    name="xquad-de",
+    pretty_name="the German version of the reading comprehension dataset XQuAD",
+    huggingface_id="EuroEval/xquad-de",
+    task=RC,
+    languages=[DE],
+    unofficial=True,
+)
+
 ARC_DE_CONFIG = DatasetConfig(
     name="arc-de",
     pretty_name="the truncated version of the German knowledge dataset ARC-de, "

@@ -121,6 +131,19 @@ GOLDENSWAG_DE_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_DE_CONFIG = DatasetConfig(
+    name="winogrande-de",
+    pretty_name="the German common-sense reasoning dataset Winogrande-de, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-de",
+    task=COMMON_SENSE,
+    languages=[DE],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_DE_CONFIG = DatasetConfig(
     name="european-values-situational-de",
     pretty_name="the German version of the European values evaluation dataset, where "
euroeval/dataset_configs/italian.py
CHANGED

@@ -1,6 +1,7 @@
 """All Italian dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import IT
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -121,6 +122,19 @@ GOLDENSWAG_IT_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_IT_CONFIG = DatasetConfig(
+    name="winogrande-it",
+    pretty_name="the Italian common-sense reasoning dataset Winogrande-it, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-it",
+    task=COMMON_SENSE,
+    languages=[IT],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_IT_CONFIG = DatasetConfig(
     name="european-values-situational-it",
     pretty_name="the Italian version of the European values evaluation dataset, "
euroeval/dataset_configs/latvian.py
CHANGED

@@ -1,6 +1,7 @@
 """All Latvian dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import LV
 from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM

@@ -79,3 +80,16 @@ WIKIANN_LV_CONFIG = DatasetConfig(
     languages=[LV],
     unofficial=True,
 )
+
+WINOGRANDE_LV_CONFIG = DatasetConfig(
+    name="winogrande-lv",
+    pretty_name="the Latvian common-sense reasoning dataset Winogrande-lv, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-lv",
+    task=COMMON_SENSE,
+    languages=[LV],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
euroeval/dataset_configs/norwegian.py
CHANGED

@@ -1,6 +1,7 @@
 """All Norwegian dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import NB, NN, NO
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -216,6 +217,19 @@ MULTI_WIKI_QA_NN_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_NO_CONFIG = DatasetConfig(
+    name="winogrande-no",
+    pretty_name="the Norwegian common-sense reasoning dataset Winogrande-no, "
+    "translated from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-no",
+    task=COMMON_SENSE,
+    languages=[NB, NN, NO],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_NO_CONFIG = DatasetConfig(
     name="european-values-situational-no",
     pretty_name="the Norwegian version of the European values evaluation dataset, "
|