EuroEval 16.0.1-py3-none-any.whl → 16.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/benchmark_config_factory.py +6 -1
- euroeval/benchmark_modules/base.py +2 -0
- euroeval/benchmark_modules/fresh.py +7 -1
- euroeval/benchmark_modules/hf.py +26 -21
- euroeval/benchmark_modules/litellm.py +258 -131
- euroeval/benchmark_modules/vllm.py +79 -40
- euroeval/benchmarker.py +11 -2
- euroeval/cli.py +14 -1
- euroeval/constants.py +1 -1
- euroeval/data_models.py +77 -6
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +14 -0
- euroeval/dataset_configs/dutch.py +14 -0
- euroeval/dataset_configs/english.py +22 -0
- euroeval/dataset_configs/estonian.py +15 -7
- euroeval/dataset_configs/finnish.py +14 -0
- euroeval/dataset_configs/french.py +14 -0
- euroeval/dataset_configs/german.py +23 -0
- euroeval/dataset_configs/italian.py +14 -0
- euroeval/dataset_configs/latvian.py +14 -0
- euroeval/dataset_configs/norwegian.py +14 -0
- euroeval/dataset_configs/polish.py +126 -0
- euroeval/dataset_configs/portuguese.py +14 -0
- euroeval/dataset_configs/spanish.py +14 -0
- euroeval/dataset_configs/swedish.py +25 -0
- euroeval/enums.py +12 -0
- euroeval/generation.py +17 -8
- euroeval/generation_utils.py +65 -11
- euroeval/metrics/pipeline.py +1 -1
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +27 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +11 -0
- euroeval/prompt_templates/sentiment_classification.py +15 -0
- euroeval/prompt_templates/summarization.py +27 -1
- euroeval/scores.py +5 -0
- euroeval/task_group_utils/question_answering.py +29 -29
- euroeval/task_group_utils/sequence_classification.py +11 -34
- euroeval/task_group_utils/token_classification.py +3 -3
- euroeval/tasks.py +4 -4
- euroeval/{tokenization_utils.py → tokenisation_utils.py} +50 -28
- euroeval/utils.py +36 -3
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/METADATA +1 -1
- euroeval-16.1.1.dist-info/RECORD +70 -0
- euroeval-16.0.1.dist-info/RECORD +0 -69
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/WHEEL +0 -0
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/entry_points.txt +0 -0
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/licenses/LICENSE +0 -0
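Note the module rename in the list above: euroeval/tokenization_utils.py is now euroeval/tokenisation_utils.py, so any code importing the old path directly will break on upgrade. A minimal compatibility sketch, assuming the module stays importable at these paths (it is an internal module, so treat that as an assumption):

try:
    # 16.1.1 and later (British spelling, per this diff)
    from euroeval.tokenisation_utils import apply_chat_template
except ImportError:
    # 16.0.1 and earlier (American spelling)
    from euroeval.tokenization_utils import apply_chat_template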
euroeval/dataset_configs/estonian.py
CHANGED

@@ -47,13 +47,12 @@ ERR_NEWS_CONFIG = DatasetConfig(
     languages=[ET],
 )
 
-
-    name="
-    pretty_name="the Estonian knowledge
-    huggingface_id="EuroEval/
+TRIVIA_ET_CONFIG = DatasetConfig(
+    name="trivia-et",
+    pretty_name="the Estonian knowledge dataset Trivia-et",
+    huggingface_id="EuroEval/trivia-et",
     task=KNOW,
     languages=[ET],
-    _labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"],
 )
 
 WINOGRANDE_ET_CONFIG = DatasetConfig(
@@ -82,8 +81,7 @@ EUROPEAN_VALUES_ET_CONFIG = DatasetConfig(
     _instruction_prompt="{text}",
 )
 
-
-### Unofficial datasets ###
+### Unofficial datasets ###
 
 SCALA_ET_CONFIG = DatasetConfig(
     name="scala-et",
@@ -93,3 +91,13 @@ SCALA_ET_CONFIG = DatasetConfig(
     languages=[ET],
     unofficial=True,
 )
+
+EXAM_ET_CONFIG = DatasetConfig(
+    name="exam-et",
+    pretty_name="the Estonian knowledge assessment dataset Exam-et",
+    huggingface_id="EuroEval/exam-et",
+    task=KNOW,
+    languages=[ET],
+    _labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"],
+    unofficial=True,
+)
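The new exam-et config spells out fifteen option labels by hand. For readers skimming the diff, the list is simply the first fifteen lowercase letters; a quick sanity check (illustration only, not EuroEval code):

import string

labels = list(string.ascii_lowercase[:15])
assert labels == ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"]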
euroeval/dataset_configs/finnish.py
CHANGED

@@ -1,6 +1,7 @@
 """All Finnish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import FI
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -101,6 +102,19 @@ GOLDENSWAG_FI_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_FI_CONFIG = DatasetConfig(
+    name="winogrande-fi",
+    pretty_name="the Finnish common-sense reasoning dataset Winogrande-fi, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-fi",
+    task=COMMON_SENSE,
+    languages=[FI],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_FI_CONFIG = DatasetConfig(
     name="european-values-situational-fi",
     pretty_name="the Finnish version of the European values evaluation dataset, where "
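The Winogrande block added here for Finnish recurs nearly verbatim for French, German, Italian, Latvian, Norwegian, Polish, Portuguese, Spanish and Swedish below; only the language code, language list and pretty name vary. A hypothetical factory capturing the shared shape (illustration only, not part of EuroEval; import paths follow the relative imports in this diff):

from euroeval.data_models import DatasetConfig
from euroeval.enums import ModelType
from euroeval.tasks import COMMON_SENSE


def make_winogrande_config(code: str, language_name: str, languages: list) -> DatasetConfig:
    # Mirrors the block repeated in this release: a binary-choice dataset with
    # train/test splits, restricted to generative models and marked unofficial.
    return DatasetConfig(
        name=f"winogrande-{code}",
        pretty_name=f"the {language_name} common-sense reasoning dataset "
        f"Winogrande-{code}, translated from the English Winogrande dataset",
        huggingface_id=f"EuroEval/winogrande-{code}",
        task=COMMON_SENSE,
        languages=languages,
        splits=["train", "test"],
        _labels=["a", "b"],
        _allowed_model_types=[ModelType.GENERATIVE],
        unofficial=True,
    )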
euroeval/dataset_configs/french.py
CHANGED

@@ -1,6 +1,7 @@
 """All French dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import FR
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -113,6 +114,19 @@ GOLDENSWAG_FR_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_FR_CONFIG = DatasetConfig(
+    name="winogrande-fr",
+    pretty_name="the French common-sense reasoning dataset Winogrande-fr, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-fr",
+    task=COMMON_SENSE,
+    languages=[FR],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_FR_CONFIG = DatasetConfig(
     name="european-values-situational-fr",
     pretty_name="the French version of the European values evaluation dataset, where "
euroeval/dataset_configs/german.py
CHANGED

@@ -1,6 +1,7 @@
 """All German dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import DE
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -81,6 +82,15 @@ EUROPEAN_VALUES_DE_CONFIG = DatasetConfig(
 
 ### Unofficial datasets ###
 
+XQUAD_DE_CONFIG = DatasetConfig(
+    name="xquad-de",
+    pretty_name="the German version of the reading comprehension dataset XQuAD",
+    huggingface_id="EuroEval/xquad-de",
+    task=RC,
+    languages=[DE],
+    unofficial=True,
+)
+
 ARC_DE_CONFIG = DatasetConfig(
     name="arc-de",
     pretty_name="the truncated version of the German knowledge dataset ARC-de, "
@@ -121,6 +131,19 @@ GOLDENSWAG_DE_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_DE_CONFIG = DatasetConfig(
+    name="winogrande-de",
+    pretty_name="the German common-sense reasoning dataset Winogrande-de, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-de",
+    task=COMMON_SENSE,
+    languages=[DE],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_DE_CONFIG = DatasetConfig(
     name="european-values-situational-de",
     pretty_name="the German version of the European values evaluation dataset, where "
euroeval/dataset_configs/italian.py
CHANGED

@@ -1,6 +1,7 @@
 """All Italian dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import IT
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -121,6 +122,19 @@ GOLDENSWAG_IT_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_IT_CONFIG = DatasetConfig(
+    name="winogrande-it",
+    pretty_name="the Italian common-sense reasoning dataset Winogrande-it, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-it",
+    task=COMMON_SENSE,
+    languages=[IT],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_IT_CONFIG = DatasetConfig(
     name="european-values-situational-it",
     pretty_name="the Italian version of the European values evaluation dataset, "
euroeval/dataset_configs/latvian.py
CHANGED

@@ -1,6 +1,7 @@
 """All Latvian dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import LV
 from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
 
@@ -79,3 +80,16 @@ WIKIANN_LV_CONFIG = DatasetConfig(
     languages=[LV],
     unofficial=True,
 )
+
+WINOGRANDE_LV_CONFIG = DatasetConfig(
+    name="winogrande-lv",
+    pretty_name="the Latvian common-sense reasoning dataset Winogrande-lv, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-lv",
+    task=COMMON_SENSE,
+    languages=[LV],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
euroeval/dataset_configs/norwegian.py
CHANGED

@@ -1,6 +1,7 @@
 """All Norwegian dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import NB, NN, NO
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -216,6 +217,19 @@ MULTI_WIKI_QA_NN_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_NO_CONFIG = DatasetConfig(
+    name="winogrande-no",
+    pretty_name="the Norwegian common-sense reasoning dataset Winogrande-no, "
+    "translated from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-no",
+    task=COMMON_SENSE,
+    languages=[NB, NN, NO],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_NO_CONFIG = DatasetConfig(
    name="european-values-situational-no",
    pretty_name="the Norwegian version of the European values evaluation dataset, "
euroeval/dataset_configs/polish.py
ADDED

@@ -0,0 +1,126 @@
+"""All Polish dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..enums import ModelType
+from ..languages import PL
+from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+POLEMO2_CONFIG = DatasetConfig(
+    name="polemo2",
+    pretty_name="the Polish sentiment classification dataset PolEmo2",
+    huggingface_id="EuroEval/polemo2-mini",
+    task=SENT,
+    languages=[PL],
+)
+
+SCALA_PL_CONFIG = DatasetConfig(
+    name="scala-pl",
+    pretty_name="the Polish part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-pl",
+    task=LA,
+    languages=[PL],
+)
+
+KPWR_NER_CONFIG = DatasetConfig(
+    name="kpwr-ner",
+    pretty_name="the Polish entity recognition dataset KPWr-NER",
+    huggingface_id="EuroEval/kpwr-ner",
+    task=NER,
+    languages=[PL],
+)
+
+POQUAD_CONFIG = DatasetConfig(
+    name="poquad",
+    pretty_name="the Polish question answering dataset PoQuAD",
+    huggingface_id="EuroEval/poquad-mini",
+    task=RC,
+    languages=[PL],
+)
+
+PSC_CONFIG = DatasetConfig(
+    name="psc",
+    pretty_name="the Polish summarisation dataset PSC",
+    huggingface_id="EuroEval/psc-mini",
+    task=SUMM,
+    languages=[PL],
+)
+
+LLMZSZL_CONFIG = DatasetConfig(
+    name="llmzszl",
+    pretty_name="the Polish knowledge dataset LLMzSzŁ",
+    huggingface_id="EuroEval/llmzszl-mini",
+    task=KNOW,
+    languages=[PL],
+)
+
+WINOGRANDE_PL_CONFIG = DatasetConfig(
+    name="winogrande-pl",
+    pretty_name="the Polish common-sense reasoning dataset Winogrande-pl, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-pl",
+    task=COMMON_SENSE,
+    languages=[PL],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+)
+
+EUROPEAN_VALUES_PL_CONFIG = DatasetConfig(
+    name="european-values-pl",
+    pretty_name="the Polish version of the European values evaluation dataset",
+    huggingface_id="EuroEval/european-values-pl",
+    task=EUROPEAN_VALUES,
+    languages=[PL],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+)
+
+
+### Unofficial datasets ###
+
+MULTI_WIKI_QA_PL_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-pl",
+    pretty_name="the truncated version of the Polish part of the reading "
+    "comprehension dataset MultiWikiQA",
+    huggingface_id="EuroEval/multi-wiki-qa-pl-mini",
+    task=RC,
+    languages=[PL],
+    unofficial=True,
+)
+
+GOLDENSWAG_PL_CONFIG = DatasetConfig(
+    name="goldenswag-pl",
+    pretty_name="the truncated version of the Polish common-sense reasoning "
+    "dataset GoldenSwag-pl, translated from the English GoldenSwag dataset",
+    huggingface_id="EuroEval/goldenswag-pl-mini",
+    task=COMMON_SENSE,
+    languages=[PL],
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_SITUATIONAL_PL_CONFIG = DatasetConfig(
+    name="european-values-situational-pl",
+    pretty_name="the Polish version of the European values evaluation dataset, where "
+    "the questions are phrased in a situational way",
+    huggingface_id="EuroEval/european-values-situational-pl",
+    task=EUROPEAN_VALUES,
+    languages=[PL],
+    splits=["test"],
+    bootstrap_samples=False,
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_COMPLETIONS_PL_CONFIG = DatasetConfig(
+    name="european-values-completions-pl",
+    pretty_name="the Polish version of the European values evaluation dataset, where "
+    "the questions are phrased as sentence completions",
+    huggingface_id="EuroEval/european-values-completions-pl",
+    task=EUROPEAN_VALUES,
+    languages=[PL],
+    splits=["test"],
+    bootstrap_samples=False,
+    unofficial=True,
+)
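Since polish.py is a brand-new module, a quick way to confirm the configs are present after upgrading is to import them from the path shown in this diff (object names taken directly from the file above):

from euroeval.dataset_configs.polish import POLEMO2_CONFIG, WINOGRANDE_PL_CONFIG

print(POLEMO2_CONFIG.name)        # polemo2
print(WINOGRANDE_PL_CONFIG.name)  # winogrande-pl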
euroeval/dataset_configs/portuguese.py
CHANGED

@@ -1,6 +1,7 @@
 """All Portuguese dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import PT
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -91,6 +92,19 @@ BOOLQ_PT_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_PT_CONFIG = DatasetConfig(
+    name="winogrande-pt",
+    pretty_name="the Portuguese common-sense reasoning dataset Winogrande-pt, "
+    "translated from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-pt",
+    task=COMMON_SENSE,
+    languages=[PT],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_PT_CONFIG = DatasetConfig(
     name="european-values-situational-pt",
     pretty_name="the Portuguese version of the European values evaluation dataset, "
euroeval/dataset_configs/spanish.py
CHANGED

@@ -1,6 +1,7 @@
 """All Spanish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import ES
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -119,6 +120,19 @@ GOLDENSWAG_ES_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_ES_CONFIG = DatasetConfig(
+    name="winogrande-es",
+    pretty_name="the Spanish common-sense reasoning dataset Winogrande-es, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-es",
+    task=COMMON_SENSE,
+    languages=[ES],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_ES_CONFIG = DatasetConfig(
     name="european-values-situational-es",
     pretty_name="the Spanish version of the European values evaluation dataset, where "
euroeval/dataset_configs/swedish.py
CHANGED

@@ -1,6 +1,7 @@
 """All Swedish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import SV
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -130,6 +131,19 @@ GOLDENSWAG_SV_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_SV_CONFIG = DatasetConfig(
+    name="winogrande-sv",
+    pretty_name="the Swedish common-sense reasoning dataset Winogrande-sv, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-sv",
+    task=COMMON_SENSE,
+    languages=[SV],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_SV_CONFIG = DatasetConfig(
     name="european-values-situational-sv",
     pretty_name="the Swedish version of the European values evaluation dataset, where "
@@ -155,3 +169,14 @@ EUROPEAN_VALUES_COMPLETIONS_SV_CONFIG = DatasetConfig(
     _instruction_prompt="{text}",
     unofficial=True,
 )
+
+SKOLPROV_CONFIG = DatasetConfig(
+    name="skolprov",
+    pretty_name="the Swedish knowledge dataset Skolprov",
+    huggingface_id="EuroEval/skolprov",
+    task=KNOW,
+    languages=[SV],
+    splits=["train", "test"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
euroeval/enums.py
CHANGED
@@ -12,6 +12,14 @@ class AutoStrEnum(str, Enum):
     ) -> str:
         return name.lower()
 
+    def __str__(self) -> str:
+        """Return the value in upper case for better readability."""
+        return self.value.upper()
+
+    def __repr__(self) -> str:
+        """Return the value in upper case for better readability."""
+        return self.value.upper()
+
 
 class Device(AutoStrEnum):
     """The compute device to use for the evaluation.
@@ -60,6 +68,10 @@ class ModelType(AutoStrEnum):
     ENCODER = auto()
     GENERATIVE = auto()
 
+    def __repr__(self) -> str:
+        """Return the value in upper case for better readability."""
+        return self.value.upper()
+
 
 class GenerativeType(AutoStrEnum):
     """The type of a generative model.
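The effect of the new __str__/__repr__ overrides is that enum members print as their upper-cased names while the stored value stays lowercase. A standalone sketch of the behaviour (reimplemented here rather than imported, so it runs in isolation):

from enum import Enum, auto


class AutoStrEnum(str, Enum):
    def _generate_next_value_(name, start, count, last_values) -> str:  # noqa: N805
        # auto() values become the lower-cased member name
        return name.lower()

    def __str__(self) -> str:
        return self.value.upper()

    def __repr__(self) -> str:
        return self.value.upper()


class ModelType(AutoStrEnum):
    ENCODER = auto()
    GENERATIVE = auto()


print(ModelType.GENERATIVE)        # GENERATIVE
print(repr(ModelType.GENERATIVE))  # GENERATIVE
print(ModelType.GENERATIVE.value)  # generative (stored value unchanged)

The extra __repr__ added to ModelType in the diff is already inherited from AutoStrEnum, so it appears redundant.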
euroeval/generation.py
CHANGED
@@ -307,7 +307,7 @@ def debug_log(
                     for label in batch["label"]
                 ]
             else:
-                labels = [
+                labels = [None] * len(extracted_labels)
 
         case TaskGroup.QUESTION_ANSWERING:
             extracted_labels = [
@@ -330,12 +330,21 @@ def debug_log(
     else:
         input_texts = batch["text"]
 
-
-
-
+    metadata_keys: list[str] = [
+        key
+        for key in batch.keys()
+        if key not in ["text", "messages", "label", "labels", "target_text"]
+    ]
+
+    for idx in range(len(input_texts)):
+        data_to_log: dict[str, t.Any] = {
+            "Input": input_texts[idx],
+            "Raw output": model_output.sequences[idx],
+            "Prediction": extracted_labels[idx],
+        }
+        if labels[idx]:
+            data_to_log["Label"] = labels[idx]
+        data_to_log |= {key.capitalize(): batch[key][idx] for key in metadata_keys}
         logger.info(
-            f"
-            f"Raw output: '{raw_output}'\n"
-            f"Prediction: '{prediction}'\n"
-            f"Label: '{label}'"
+            "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items())
         )
euroeval/generation_utils.py
CHANGED
@@ -4,11 +4,12 @@ import itertools as it
 import json
 import logging
 import random
+import re
 import typing as t
 
-from .enums import TaskGroup
-from .exceptions import InvalidBenchmark
-from .tokenization_utils import apply_chat_template
+from .enums import GenerativeType, TaskGroup
+from .exceptions import InvalidBenchmark, InvalidModel
+from .tokenisation_utils import apply_chat_template
 from .utils import extract_multiple_choice_labels, log_once
 
 if t.TYPE_CHECKING:
@@ -173,7 +174,7 @@ def apply_prompt(
     few_shot_examples: list[dict[str, t.Any]],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
-
+    generative_type: GenerativeType | None,
     always_populate_text_field: bool,
     tokeniser: "PreTrainedTokenizer | None",
 ) -> dict[str, t.Any]:
@@ -184,10 +185,12 @@ def apply_prompt(
            The examples to apply the few-shot examples to.
        few_shot_examples:
            The few-shot examples to apply.
+       model_config:
+           The model configuration.
        dataset_config:
            The dataset configuration.
-
-
+       generative_type:
+           The generative type of the model.
        always_populate_text_field:
            Whether to always populate the 'text' field in the examples, as opposed to
            the 'messages' field.
@@ -198,7 +201,11 @@ def apply_prompt(
        The example with the few-shot examples applied.
    """
    # Sanity check
-    if
+    if (
+        generative_type in {GenerativeType.INSTRUCTION_TUNED, GenerativeType.REASONING}
+        and always_populate_text_field
+        and tokeniser is None
+    ):
        raise ValueError(
            "The `tokeniser` argument must be provided when the model is instruction "
            "tuned and when we are not just returning the raw messages."
@@ -222,7 +229,10 @@ def apply_prompt(
        )
        label_mapping = dataset_config.prompt_label_mapping
        label = label_mapping.get(label, label)
-        if
+        if generative_type in {
+            GenerativeType.INSTRUCTION_TUNED,
+            GenerativeType.REASONING,
+        }:
            prompt = dataset_config.instruction_prompt.format(**kwargs)
            return prompt, label
        else:
@@ -348,7 +358,7 @@ def apply_prompt(
            f"Unsupported task group: {dataset_config.task.task_group}."
        )
 
-    if
+    if generative_type in {GenerativeType.INSTRUCTION_TUNED, GenerativeType.REASONING}:
        few_shot_messages = [
            dict(role=role, content=content)
            for prompt, label in few_shot_sections
@@ -362,7 +372,6 @@ def apply_prompt(
 
        if not always_populate_text_field:
            examples["messages"] = messages_list
-
        else:
            assert tokeniser is not None
 
@@ -389,6 +398,9 @@ def apply_prompt(
                apply_chat_template(
                    conversation=messages,
                    tokeniser=tokeniser,
+                   tokenise=False,
+                   add_generation_prompt=True,
+                   enable_thinking=(generative_type == GenerativeType.REASONING),
                    chat_template=chat_template,
                )
                for messages in messages_list
@@ -399,7 +411,10 @@ def apply_prompt(
    else:
        prompt_prefix = ""
        if dataset_config.prompt_prefix:
-
+            labels_str = dataset_config.get_labels_str()
+            prompt_prefix = (
+                dataset_config.prompt_prefix.format(labels_str=labels_str) + "\n\n"
+            )
 
        few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
        if few_shot_prompt:
@@ -414,3 +429,42 @@ def apply_prompt(
    examples["prompt"] = [new_prompt for new_prompt, _ in new_sections]
 
    return examples
+
+
+def raise_if_wrong_params(
+    model_config: "ModelConfig", allowed_params: dict[re.Pattern, list[str]]
+) -> None:
+    """Raise an error if the model configuration has invalid parameters.
+
+    Args:
+        model_config:
+            The model configuration.
+        allowed_params:
+            The allowed parameters for the model, being a dictionary mapping a regex
+            pattern matching the model ID to a list of allowed parameters for those
+            models.
+
+    Raises:
+        InvalidModel:
+            If the model configuration has invalid parameters.
+    """
+    if model_config.param is None:
+        return
+    for model_regex, allowed_params_list in allowed_params.items():
+        if re.fullmatch(pattern=model_regex, string=model_config.model_id):
+            if model_config.param not in allowed_params_list:
+                msg = (
+                    f"Invalid parameter {model_config.param!r} for model "
+                    f"{model_config.model_id!r}."
+                )
+                if allowed_params_list:
+                    msg += f" Allowed parameters are: {', '.join(allowed_params_list)}."
+                else:
+                    msg += " No parameters are allowed."
+                raise InvalidModel(msg)
+            return
+    else:
+        raise InvalidModel(
+            f"The parameter {model_config.param!r} is not supported for the model "
+            f"{model_config.model_id!r}."
+        )
euroeval/metrics/pipeline.py
CHANGED
@@ -217,7 +217,7 @@ def european_values_preprocessing_fn(
     )
 
     # Double check that we reshaped the predictions correctly
-    for idx, pred in enumerate(
+    for idx, pred in enumerate(integer_predictions):
         assert arr[idx // 5, idx % 5] == pred, (
             f"Reshaped predictions do not match the original predictions at index "
             f"{idx}: {arr[idx // 5, idx % 5]} != {pred}."