EuroEval 16.0.0__py3-none-any.whl → 16.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +5 -0
- euroeval/benchmark_config_factory.py +6 -1
- euroeval/benchmark_modules/base.py +2 -0
- euroeval/benchmark_modules/fresh.py +7 -1
- euroeval/benchmark_modules/hf.py +26 -21
- euroeval/benchmark_modules/litellm.py +258 -131
- euroeval/benchmark_modules/vllm.py +120 -68
- euroeval/benchmarker.py +11 -2
- euroeval/cli.py +14 -1
- euroeval/constants.py +7 -1
- euroeval/data_models.py +95 -20
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +14 -3
- euroeval/dataset_configs/dutch.py +14 -0
- euroeval/dataset_configs/english.py +22 -0
- euroeval/dataset_configs/estonian.py +15 -7
- euroeval/dataset_configs/finnish.py +14 -0
- euroeval/dataset_configs/french.py +14 -0
- euroeval/dataset_configs/german.py +23 -0
- euroeval/dataset_configs/italian.py +14 -0
- euroeval/dataset_configs/latvian.py +14 -0
- euroeval/dataset_configs/norwegian.py +14 -0
- euroeval/dataset_configs/polish.py +126 -0
- euroeval/dataset_configs/portuguese.py +14 -0
- euroeval/dataset_configs/spanish.py +14 -0
- euroeval/dataset_configs/swedish.py +25 -0
- euroeval/enums.py +12 -0
- euroeval/generation.py +17 -8
- euroeval/generation_utils.py +102 -16
- euroeval/metrics/pipeline.py +51 -9
- euroeval/model_cache.py +13 -1
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +27 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +11 -0
- euroeval/prompt_templates/sentiment_classification.py +15 -0
- euroeval/prompt_templates/summarization.py +27 -1
- euroeval/scores.py +5 -0
- euroeval/task_group_utils/multiple_choice_classification.py +2 -2
- euroeval/task_group_utils/question_answering.py +29 -29
- euroeval/task_group_utils/sequence_classification.py +71 -81
- euroeval/task_group_utils/token_classification.py +17 -3
- euroeval/tasks.py +12 -10
- euroeval/{tokenization_utils.py → tokenisation_utils.py} +41 -25
- euroeval/utils.py +67 -3
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/METADATA +3 -1
- euroeval-16.1.0.dist-info/RECORD +70 -0
- euroeval-16.0.0.dist-info/RECORD +0 -69
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/WHEEL +0 -0
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/licenses/LICENSE +0 -0
euroeval/dataset_configs/polish.py
ADDED

@@ -0,0 +1,126 @@
+"""All Polish dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..enums import ModelType
+from ..languages import PL
+from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+POLEMO2_CONFIG = DatasetConfig(
+    name="polemo2",
+    pretty_name="the Polish sentiment classification dataset PolEmo2",
+    huggingface_id="EuroEval/polemo2-mini",
+    task=SENT,
+    languages=[PL],
+)
+
+SCALA_PL_CONFIG = DatasetConfig(
+    name="scala-pl",
+    pretty_name="the Polish part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-pl",
+    task=LA,
+    languages=[PL],
+)
+
+KPWR_NER_CONFIG = DatasetConfig(
+    name="kpwr-ner",
+    pretty_name="the Polish entity recognition dataset KPWr-NER",
+    huggingface_id="EuroEval/kpwr-ner",
+    task=NER,
+    languages=[PL],
+)
+
+POQUAD_CONFIG = DatasetConfig(
+    name="poquad",
+    pretty_name="the Polish question answering dataset PoQuAD",
+    huggingface_id="EuroEval/poquad-mini",
+    task=RC,
+    languages=[PL],
+)
+
+PSC_CONFIG = DatasetConfig(
+    name="psc",
+    pretty_name="the Polish summarisation dataset PSC",
+    huggingface_id="EuroEval/psc-mini",
+    task=SUMM,
+    languages=[PL],
+)
+
+LLMZSZL_CONFIG = DatasetConfig(
+    name="llmzszl",
+    pretty_name="the Polish knowledge dataset LLMzSzŁ",
+    huggingface_id="EuroEval/llmzszl-mini",
+    task=KNOW,
+    languages=[PL],
+)
+
+WINOGRANDE_PL_CONFIG = DatasetConfig(
+    name="winogrande-pl",
+    pretty_name="the Polish common-sense reasoning dataset Winogrande-pl, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-pl",
+    task=COMMON_SENSE,
+    languages=[PL],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+)
+
+EUROPEAN_VALUES_PL_CONFIG = DatasetConfig(
+    name="european-values-pl",
+    pretty_name="the Polish version of the European values evaluation dataset",
+    huggingface_id="EuroEval/european-values-pl",
+    task=EUROPEAN_VALUES,
+    languages=[PL],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+)
+
+
+### Unofficial datasets ###
+
+MULTI_WIKI_QA_PL_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-pl",
+    pretty_name="the truncated version of the Polish part of the reading "
+    "comprehension dataset MultiWikiQA",
+    huggingface_id="EuroEval/multi-wiki-qa-pl-mini",
+    task=RC,
+    languages=[PL],
+    unofficial=True,
+)
+
+GOLDENSWAG_PL_CONFIG = DatasetConfig(
+    name="goldenswag-pl",
+    pretty_name="the truncated version of the Polish common-sense reasoning "
+    "dataset GoldenSwag-pl, translated from the English GoldenSwag dataset",
+    huggingface_id="EuroEval/goldenswag-pl-mini",
+    task=COMMON_SENSE,
+    languages=[PL],
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_SITUATIONAL_PL_CONFIG = DatasetConfig(
+    name="european-values-situational-pl",
+    pretty_name="the Polish version of the European values evaluation dataset, where "
+    "the questions are phrased in a situational way",
+    huggingface_id="EuroEval/european-values-situational-pl",
+    task=EUROPEAN_VALUES,
+    languages=[PL],
+    splits=["test"],
+    bootstrap_samples=False,
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_COMPLETIONS_PL_CONFIG = DatasetConfig(
+    name="european-values-completions-pl",
+    pretty_name="the Polish version of the European values evaluation dataset, where "
+    "the questions are phrased as sentence completions",
+    huggingface_id="EuroEval/european-values-completions-pl",
+    task=EUROPEAN_VALUES,
+    languages=[PL],
+    splits=["test"],
+    bootstrap_samples=False,
+    unofficial=True,
+)
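Note: the new Polish module only defines module-level DatasetConfig objects; the one-line change to euroeval/dataset_configs/__init__.py in the file list suggests the module is simply registered next to the other language modules. A minimal sketch of collecting these configs programmatically (the collection loop below is illustrative, not EuroEval's own registration code):

import importlib

from euroeval.data_models import DatasetConfig

# Import the module added in this release and gather every module-level
# DatasetConfig it defines.
polish = importlib.import_module("euroeval.dataset_configs.polish")
polish_configs = [
    obj for obj in vars(polish).values() if isinstance(obj, DatasetConfig)
]
print(sorted(cfg.name for cfg in polish_configs))
# ['european-values-completions-pl', 'european-values-pl', ..., 'winogrande-pl']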
euroeval/dataset_configs/portuguese.py
CHANGED

@@ -1,6 +1,7 @@
 """All Portuguese dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import PT
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 

@@ -91,6 +92,19 @@ BOOLQ_PT_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_PT_CONFIG = DatasetConfig(
+    name="winogrande-pt",
+    pretty_name="the Portuguese common-sense reasoning dataset Winogrande-pt, "
+    "translated from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-pt",
+    task=COMMON_SENSE,
+    languages=[PT],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_PT_CONFIG = DatasetConfig(
     name="european-values-situational-pt",
     pretty_name="the Portuguese version of the European values evaluation dataset, "
euroeval/dataset_configs/spanish.py
CHANGED

@@ -1,6 +1,7 @@
 """All Spanish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import ES
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 

@@ -119,6 +120,19 @@ GOLDENSWAG_ES_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_ES_CONFIG = DatasetConfig(
+    name="winogrande-es",
+    pretty_name="the Spanish common-sense reasoning dataset Winogrande-es, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-es",
+    task=COMMON_SENSE,
+    languages=[ES],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_ES_CONFIG = DatasetConfig(
     name="european-values-situational-es",
     pretty_name="the Spanish version of the European values evaluation dataset, where "
euroeval/dataset_configs/swedish.py
CHANGED

@@ -1,6 +1,7 @@
 """All Swedish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import SV
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 

@@ -130,6 +131,19 @@ GOLDENSWAG_SV_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_SV_CONFIG = DatasetConfig(
+    name="winogrande-sv",
+    pretty_name="the Swedish common-sense reasoning dataset Winogrande-sv, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-sv",
+    task=COMMON_SENSE,
+    languages=[SV],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_SV_CONFIG = DatasetConfig(
     name="european-values-situational-sv",
     pretty_name="the Swedish version of the European values evaluation dataset, where "

@@ -155,3 +169,14 @@ EUROPEAN_VALUES_COMPLETIONS_SV_CONFIG = DatasetConfig(
     _instruction_prompt="{text}",
     unofficial=True,
 )
+
+SKOLPROV_CONFIG = DatasetConfig(
+    name="skolprov",
+    pretty_name="the Swedish knowledge dataset Skolprov",
+    huggingface_id="EuroEval/skolprov",
+    task=KNOW,
+    languages=[SV],
+    splits=["train", "test"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
euroeval/enums.py
CHANGED
@@ -12,6 +12,14 @@ class AutoStrEnum(str, Enum):
     ) -> str:
         return name.lower()
 
+    def __str__(self) -> str:
+        """Return the value in upper case for better readability."""
+        return self.value.upper()
+
+    def __repr__(self) -> str:
+        """Return the value in upper case for better readability."""
+        return self.value.upper()
+
 
 class Device(AutoStrEnum):
     """The compute device to use for the evaluation.

@@ -60,6 +68,10 @@ class ModelType(AutoStrEnum):
     ENCODER = auto()
     GENERATIVE = auto()
 
+    def __repr__(self) -> str:
+        """Return the value in upper case for better readability."""
+        return self.value.upper()
+
 
 class GenerativeType(AutoStrEnum):
     """The type of a generative model.
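Note: AutoStrEnum still derives each member's value from its lower-cased name, so the new __str__/__repr__ overrides only change how members are displayed. A small illustrative check (not part of the package):

from euroeval.enums import ModelType

# Values remain the lower-cased member names, so comparisons and serialisation
# are unchanged; only the printed form is upper-cased now.
assert ModelType.GENERATIVE.value == "generative"
assert str(ModelType.GENERATIVE) == "GENERATIVE"
print(repr(ModelType.GENERATIVE))  # GENERATIVE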
euroeval/generation.py
CHANGED
@@ -307,7 +307,7 @@ def debug_log(
                     for label in batch["label"]
                 ]
             else:
-                labels = [
+                labels = [None] * len(extracted_labels)
 
         case TaskGroup.QUESTION_ANSWERING:
             extracted_labels = [

@@ -330,12 +330,21 @@ def debug_log(
     else:
         input_texts = batch["text"]
 
-
-
-
+    metadata_keys: list[str] = [
+        key
+        for key in batch.keys()
+        if key not in ["text", "messages", "label", "labels", "target_text"]
+    ]
+
+    for idx in range(len(input_texts)):
+        data_to_log: dict[str, t.Any] = {
+            "Input": input_texts[idx],
+            "Raw output": model_output.sequences[idx],
+            "Prediction": extracted_labels[idx],
+        }
+        if labels[idx]:
+            data_to_log["Label"] = labels[idx]
+        data_to_log |= {key.capitalize(): batch[key][idx] for key in metadata_keys}
         logger.info(
-            f"
-            f"Raw output: '{raw_output}'\n"
-            f"Prediction: '{prediction}'\n"
-            f"Label: '{label}'"
+            "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items())
         )
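Note: the reworked debug_log now emits one key/value block per example, built from the data_to_log dict above. A rough illustration with invented values:

# The join expression mirrors the new logger.info call in debug_log.
data_to_log = {
    "Input": "Tekst: Ala ma kota.",
    "Raw output": "pozytywny",
    "Prediction": "positive",
    "Label": "positive",
}
print("\n".join(f"{key}: {value!r}" for key, value in data_to_log.items()))
# Input: 'Tekst: Ala ma kota.'
# Raw output: 'pozytywny'
# Prediction: 'positive'
# Label: 'positive'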
euroeval/generation_utils.py
CHANGED
@@ -4,12 +4,13 @@ import itertools as it
 import json
 import logging
 import random
+import re
 import typing as t
 
-from .enums import TaskGroup
-from .exceptions import InvalidBenchmark
-from .
-from .utils import log_once
+from .enums import GenerativeType, TaskGroup
+from .exceptions import InvalidBenchmark, InvalidModel
+from .tokenisation_utils import apply_chat_template
+from .utils import extract_multiple_choice_labels, log_once
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict

@@ -173,7 +174,7 @@ def apply_prompt(
     few_shot_examples: list[dict[str, t.Any]],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
-
+    generative_type: GenerativeType | None,
     always_populate_text_field: bool,
     tokeniser: "PreTrainedTokenizer | None",
 ) -> dict[str, t.Any]:

@@ -184,10 +185,12 @@ def apply_prompt(
            The examples to apply the few-shot examples to.
        few_shot_examples:
            The few-shot examples to apply.
+       model_config:
+           The model configuration.
        dataset_config:
            The dataset configuration.
-
-
+       generative_type:
+           The generative type of the model.
        always_populate_text_field:
            Whether to always populate the 'text' field in the examples, as opposed to
            the 'messages' field.

@@ -198,7 +201,11 @@ def apply_prompt(
        The example with the few-shot examples applied.
     """
     # Sanity check
-    if
+    if (
+        generative_type == GenerativeType.INSTRUCTION_TUNED
+        and always_populate_text_field
+        and tokeniser is None
+    ):
         raise ValueError(
             "The `tokeniser` argument must be provided when the model is instruction "
             "tuned and when we are not just returning the raw messages."

@@ -222,7 +229,7 @@ def apply_prompt(
         )
         label_mapping = dataset_config.prompt_label_mapping
         label = label_mapping.get(label, label)
-        if
+        if generative_type == GenerativeType.INSTRUCTION_TUNED:
             prompt = dataset_config.instruction_prompt.format(**kwargs)
             return prompt, label
         else:

@@ -230,18 +237,49 @@ def apply_prompt(
             return dataset_config.prompt_template.format(**kwargs), ""
 
     match dataset_config.task.task_group:
-        case
-
-        ):
+        case TaskGroup.SEQUENCE_CLASSIFICATION:
+            labels_str = dataset_config.get_labels_str()
             few_shot_sections = [
                 create_prompt(
                     text=example["text"].replace("\n", " ").strip(),
                     label=example["label"].replace("\n", " ").strip(),
+                    labels_str=labels_str,
                 )
                 for example in few_shot_examples
             ]
             new_sections = [
-                create_prompt(
+                create_prompt(
+                    text=text.replace("\n", " ").strip(),
+                    label="",
+                    labels_str=labels_str,
+                )
+                for text in examples["text"]
+            ]
+
+        case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+            few_shot_sections = [
+                create_prompt(
+                    text=example["text"].replace("\n", " ").strip(),
+                    label=example["label"].replace("\n", " ").strip(),
+                    labels_str=dataset_config.get_labels_str(
+                        labels=extract_multiple_choice_labels(
+                            prompt=example["text"],
+                            candidate_labels=dataset_config.labels,
+                        )
+                    ),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(
+                    text=text.replace("\n", " ").strip(),
+                    label="",
+                    labels_str=dataset_config.get_labels_str(
+                        labels=extract_multiple_choice_labels(
+                            prompt=text, candidate_labels=dataset_config.labels
+                        )
+                    ),
+                )
                 for text in examples["text"]
             ]
 

@@ -259,6 +297,7 @@ def apply_prompt(
             ]
 
         case TaskGroup.TOKEN_CLASSIFICATION:
+            labels_str = dataset_config.get_labels_str()
 
             def create_label(example: dict) -> str:
                 prompt_labels = dataset_config.prompt_label_mapping.values()

@@ -280,12 +319,15 @@ def apply_prompt(
                 create_prompt(
                     text=" ".join(example["tokens"]).replace("\n", " ").strip(),
                     label=create_label(example=example),
+                    labels_str=labels_str,
                 )
                 for example in few_shot_examples
             ]
             new_sections = [
                 create_prompt(
-                    text=" ".join(tokens).replace("\n", " ").strip(),
+                    text=" ".join(tokens).replace("\n", " ").strip(),
+                    label="",
+                    labels_str=labels_str,
                 )
                 for tokens in examples["tokens"]
             ]

@@ -313,7 +355,7 @@ def apply_prompt(
                 f"Unsupported task group: {dataset_config.task.task_group}."
             )
 
-    if
+    if generative_type == GenerativeType.INSTRUCTION_TUNED:
         few_shot_messages = [
             dict(role=role, content=content)
             for prompt, label in few_shot_sections

@@ -327,7 +369,6 @@ def apply_prompt(
 
     if not always_populate_text_field:
         examples["messages"] = messages_list
-
     else:
         assert tokeniser is not None
 

@@ -354,6 +395,9 @@ def apply_prompt(
             apply_chat_template(
                 conversation=messages,
                 tokeniser=tokeniser,
+                tokenise=False,
+                add_generation_prompt=True,
+                enable_thinking=(generative_type == GenerativeType.REASONING),
                 chat_template=chat_template,
             )
             for messages in messages_list

@@ -375,4 +419,46 @@ def apply_prompt(
         for new_prompt, _ in new_sections
     ]
 
+    # Always add the final prompts without few-shot examples, too, for analysis
+    examples["prompt"] = [new_prompt for new_prompt, _ in new_sections]
+
     return examples
+
+
+def raise_if_wrong_params(
+    model_config: "ModelConfig", allowed_params: dict[re.Pattern, list[str]]
+) -> None:
+    """Raise an error if the model configuration has invalid parameters.
+
+    Args:
+        model_config:
+            The model configuration.
+        allowed_params:
+            The allowed parameters for the model, being a dictionary mapping a regex
+            pattern matching the model ID to a list of allowed parameters for those
+            models.
+
+    Raises:
+        InvalidModel:
+            If the model configuration has invalid parameters.
+    """
+    if model_config.param is None:
+        return
+    for model_regex, allowed_params_list in allowed_params.items():
+        if re.fullmatch(pattern=model_regex, string=model_config.model_id):
+            if model_config.param not in allowed_params_list:
+                msg = (
+                    f"Invalid parameter {model_config.param!r} for model "
+                    f"{model_config.model_id!r}."
+                )
+                if allowed_params_list:
+                    msg += f" Allowed parameters are: {', '.join(allowed_params_list)}."
+                else:
+                    msg += " No parameters are allowed."
+                raise InvalidModel(msg)
+            return
+    else:
+        raise InvalidModel(
+            f"The parameter {model_config.param!r} is not supported for the model "
+            f"{model_config.model_id!r}."
+        )
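Note: the new raise_if_wrong_params helper only reads the param and model_id attributes of the model configuration, so its behaviour can be illustrated with a stand-in object; the regex pattern, model ID and parameter names below are invented for the example:

import re
from dataclasses import dataclass

from euroeval.exceptions import InvalidModel
from euroeval.generation_utils import raise_if_wrong_params

@dataclass
class FakeModelConfig:  # stand-in; the real ModelConfig lives in euroeval.data_models
    model_id: str
    param: str | None

allowed = {re.compile(r"example-org/.*"): ["low", "high"]}

# Matching regex and an allowed parameter: passes silently.
raise_if_wrong_params(
    model_config=FakeModelConfig("example-org/model", "low"), allowed_params=allowed
)

# Matching regex but a disallowed parameter: raises InvalidModel.
try:
    raise_if_wrong_params(
        model_config=FakeModelConfig("example-org/model", "tiny"), allowed_params=allowed
    )
except InvalidModel as err:
    print(err)
    # Invalid parameter 'tiny' for model 'example-org/model'. Allowed parameters are: low, high.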
euroeval/metrics/pipeline.py
CHANGED
@@ -26,6 +26,27 @@ logger: logging.Logger = logging.getLogger("euroeval")
 T = t.TypeVar("T", bound=int | float | str | bool)
 
 
+class PreprocessingFunction(t.Protocol):
+    """A protocol for a preprocessing function."""
+
+    def __call__(
+        self, predictions: c.Sequence[int], dataset: "Dataset"
+    ) -> c.Sequence[int]:
+        """Preprocess the model predictions before they are passed to the pipeline.
+
+        Args:
+            predictions:
+                The model predictions.
+            dataset:
+                The dataset used for evaluation. This is only used in case any
+                additional metadata is used to compute the metrics.
+
+        Returns:
+            The preprocessed model predictions.
+        """
+        ...
+
+
 class PipelineMetric(Metric):
     """Load a scikit-learn pipeline and use it to get scores from the predictions."""
 

@@ -36,7 +57,7 @@ class PipelineMetric(Metric):
         pipeline_repo: str,
         pipeline_scoring_function: c.Callable[["Pipeline", c.Sequence], float],
         pipeline_file_name: str = "pipeline.pkl",
-        preprocessing_fn:
+        preprocessing_fn: PreprocessingFunction | None = None,
         postprocessing_fn: c.Callable[[float], tuple[float, str]] | None = None,
     ) -> None:
         """Initialise the pipeline transform metric.

@@ -101,7 +122,10 @@ class PipelineMetric(Metric):
         """
         if self.pipeline is None:
             self.pipeline = self._download_pipeline()
-
+        if self.preprocessing_fn is not None:
+            predictions = self.preprocessing_fn(
+                predictions=predictions, dataset=dataset
+            )
         return self.pipeline_scoring_function(self.pipeline, predictions)
 
     def _download_pipeline(self) -> "Pipeline":

@@ -133,13 +157,18 @@
 ### European Values Metric ###
 
 
-def european_values_preprocessing_fn(
+def european_values_preprocessing_fn(
+    predictions: c.Sequence[int], dataset: "Dataset"
+) -> c.Sequence[int]:
     """Preprocess the model predictions for the European Values metric.
 
     Args:
         predictions:
             The model predictions, a sequence of integers representing the predicted
             choices for each question.
+        dataset:
+            The dataset used for evaluation. This is only used in case any additional
+            metadata is used to compute the metrics.
 
     Returns:
         The preprocessed model predictions, a sequence of integers representing the

@@ -154,6 +183,17 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
     num_questions = 53
     num_phrasings_per_question = 5
 
+    # Convert the predictions to integers
+    integer_predictions = []
+    for prediction, idx_to_choice in zip(predictions, dataset["idx_to_choice"]):
+        idx_to_choice = {
+            int(idx): int(choice)
+            for idx, choice in idx_to_choice.items()
+            if choice is not None
+        }
+        integer_prediction = idx_to_choice[prediction]
+        integer_predictions.append(integer_prediction)
+
     assert len(predictions) % num_questions == 0, (
         f"The number of predictions ({len(predictions)}) is not a multiple of "
         f"{num_questions}, which is required for the European Values metric."

@@ -171,13 +211,13 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
     # Shape: (num_questions, num_phrasings_per_question)
     arr = np.array(
         [
-
+            integer_predictions[i : i + num_phrasings_per_question]
             for i in range(0, len(predictions), num_phrasings_per_question)
         ]
     )
 
     # Double check that we reshaped the predictions correctly
-    for idx, pred in enumerate(
+    for idx, pred in enumerate(integer_predictions):
         assert arr[idx // 5, idx % 5] == pred, (
             f"Reshaped predictions do not match the original predictions at index "
             f"{idx}: {arr[idx // 5, idx % 5]} != {pred}."

@@ -188,7 +228,7 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
     arr = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=arr)
 
     # Convert the array to a list
-
+    integer_predictions = arr.tolist()
 
     # Some of the questions are categorical and we're only interested in whether the
     # model chooses a specific choice or not. This mapping takes the question index

@@ -208,11 +248,13 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
     }
 
     # Map the predictions to the choices we're interested in
-
+    integer_predictions = list(integer_predictions)
     for question_idx, choice in question_choices.items():
-
+        integer_predictions[question_idx] = (
+            1 if integer_predictions[question_idx] == choice else 0
+        )
 
-    return
+    return integer_predictions
 
 
 def european_values_scoring_function(
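Note: any callable taking the parameters predictions and dataset satisfies the new PreprocessingFunction protocol; PipelineMetric now calls it with keyword arguments, so the parameter names matter. A minimal conforming example (illustrative only, not part of the package):

import collections.abc as c

from datasets import Dataset

def clip_preprocessing_fn(predictions: c.Sequence[int], dataset: Dataset) -> c.Sequence[int]:
    # Toy preprocessing step: clamp every predicted choice index to be non-negative.
    return [max(0, pred) for pred in predictions]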
euroeval/model_cache.py
CHANGED
@@ -10,7 +10,9 @@ from dataclasses import asdict
 
 from tqdm.auto import tqdm
 
+from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import GenerativeModelOutput, SingleGenerativeModelOutput
+from .utils import log_once
 
 if t.TYPE_CHECKING:
     from pathlib import Path

@@ -189,10 +191,20 @@ class ModelCache:
                 # the indices of the top scores, to save space. Further, we only store
                 # the scores if the generated sequence is shorter than the maximum
                 # length
-                if
+                if (
+                    model_output.scores is not None
+                    and self.max_generated_tokens
+                    <= NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
+                ):
                     assert model_output.scores is not None
                     scores = model_output.scores[sample_idx]
                 else:
+                    if model_output.scores is not None:
+                        log_once(
+                            "The generated sequence is longer than the maximum "
+                            "length for classification. Not caching the scores.",
+                            level=logging.DEBUG,
+                        )
                     scores = None
                 self[model_input] = SingleGenerativeModelOutput(
                     sequence=model_output.sequences[sample_idx], scores=scores
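Note: the new caching rule means token-level scores are only stored when the generation budget is classification-sized. Restated as a standalone predicate (a sketch; NUM_GENERATION_TOKENS_FOR_CLASSIFICATION is defined in euroeval.constants, but its value is not shown in this diff):

from euroeval.constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION

def should_cache_scores(scores: object, max_generated_tokens: int) -> bool:
    # Mirrors the condition added to ModelCache above.
    return scores is not None and (
        max_generated_tokens <= NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
    )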
euroeval/prompt_templates/linguistic_acceptability.py
CHANGED

@@ -19,6 +19,7 @@ from ..languages import (
     NL,
     NN,
     NO,
+    PL,
     PT,
     SV,
 )

@@ -67,6 +68,14 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
         default_instruction_prompt="Lause: {text}\n\nOtsusta, kas lause on "
         "grammatiliselt õige või mitte. Vasta {labels_str}, ja mitte midagi muud.",
     ),
+    PL: PromptConfig(
+        default_prompt_label_mapping=dict(correct="tak", incorrect="nie"),
+        default_prompt_prefix="Poniżej znajdują się teksty i czy są "
+        "gramatycznie poprawne.",
+        default_prompt_template="Tekst: {text}\nGramatycznie poprawny: {label}",
+        default_instruction_prompt="Tekst: {text}\n\nOkreśl czy tekst jest "
+        "gramatycznie poprawny czy nie. Odpowiedz {labels_str}, i nic więcej.",
+    ),
     PT: PromptConfig(
         default_prompt_label_mapping=dict(correct="sim", incorrect="não"),
         default_prompt_prefix="Seguem-se abaixo textos e se são "
|