EuroEval 15.16.0__py3-none-any.whl → 16.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (64)
  1. euroeval/__init__.py +8 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +199 -139
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +19 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +73 -23
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +35 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +90 -20
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +276 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/model_cache.py +13 -1
  41. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  42. euroeval/prompt_templates/multiple_choice.py +23 -2
  43. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  44. euroeval/prompt_templates/reading_comprehension.py +42 -2
  45. euroeval/prompt_templates/sentiment_classification.py +46 -2
  46. euroeval/prompt_templates/summarization.py +24 -4
  47. euroeval/scores.py +7 -2
  48. euroeval/speed_benchmark.py +6 -6
  49. euroeval/task_group_utils/multiple_choice_classification.py +19 -8
  50. euroeval/task_group_utils/question_answering.py +35 -28
  51. euroeval/task_group_utils/sequence_classification.py +128 -42
  52. euroeval/task_group_utils/text_to_text.py +7 -3
  53. euroeval/task_group_utils/token_classification.py +59 -73
  54. euroeval/tasks.py +33 -6
  55. euroeval/tokenization_utils.py +294 -207
  56. euroeval/utils.py +150 -35
  57. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
  58. euroeval-16.0.1.dist-info/RECORD +69 -0
  59. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
  60. euroeval/human_evaluation.py +0 -738
  61. euroeval/metrics.py +0 -470
  62. euroeval-15.16.0.dist-info/RECORD +0 -63
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
  64. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0

euroeval/metrics/pipeline.py ADDED
@@ -0,0 +1,276 @@
+ """Metrics based on a scikit-learn Pipeline."""
+
+ import collections.abc as c
+ import logging
+ import typing as t
+ from pathlib import Path
+
+ import cloudpickle
+ import huggingface_hub as hf_hub
+ import numpy as np
+ from scipy.special import expit as sigmoid
+
+ from ..exceptions import InvalidBenchmark
+ from ..utils import unscramble
+ from .base import Metric
+
+ if t.TYPE_CHECKING:
+     from datasets.arrow_dataset import Dataset
+     from sklearn.pipeline import Pipeline
+
+     from ..data_models import BenchmarkConfig, DatasetConfig
+
+ logger: logging.Logger = logging.getLogger("euroeval")
+
+
+ T = t.TypeVar("T", bound=int | float | str | bool)
+
+
+ class PreprocessingFunction(t.Protocol):
+     """A protocol for a preprocessing function."""
+
+     def __call__(
+         self, predictions: c.Sequence[int], dataset: "Dataset"
+     ) -> c.Sequence[int]:
+         """Preprocess the model predictions before they are passed to the pipeline.
+
+         Args:
+             predictions:
+                 The model predictions.
+             dataset:
+                 The dataset used for evaluation. This is only used in case any
+                 additional metadata is used to compute the metrics.
+
+         Returns:
+             The preprocessed model predictions.
+         """
+         ...
+
+
+ class PipelineMetric(Metric):
+     """Load a scikit-learn pipeline and use it to get scores from the predictions."""
+
+     def __init__(
+         self,
+         name: str,
+         pretty_name: str,
+         pipeline_repo: str,
+         pipeline_scoring_function: c.Callable[["Pipeline", c.Sequence], float],
+         pipeline_file_name: str = "pipeline.pkl",
+         preprocessing_fn: PreprocessingFunction | None = None,
+         postprocessing_fn: c.Callable[[float], tuple[float, str]] | None = None,
+     ) -> None:
+         """Initialise the pipeline transform metric.
+
+         Args:
+             name:
+                 The name of the metric in snake_case.
+             pretty_name:
+                 The pretty name of the metric, used for display purposes.
+             pipeline_repo:
+                 The Hugging Face repository ID of the scikit-learn pipeline to load.
+             pipeline_scoring_function:
+                 The function used to score the predictions with the pipeline. Takes
+                 the pipeline and a 1D sequence of predictions and returns a float
+                 score.
+             pipeline_file_name (optional):
+                 The name of the file to download from the Hugging Face repository.
+                 Defaults to "pipeline.pkl".
+             preprocessing_fn (optional):
+                 A function to apply to the predictions before they are passed to the
+                 pipeline. This is useful for preprocessing the predictions to match
+                 the expected input format of the pipeline. Defaults to None, in which
+                 case the predictions are passed to the pipeline unchanged.
+             postprocessing_fn (optional):
+                 A function to apply to the metric scores after they are computed,
+                 taking the score to the postprocessed score along with its string
+                 representation. Defaults to x -> (100 * x, f"{x:.2%}").
+         """
+         super().__init__(
+             name=name, pretty_name=pretty_name, postprocessing_fn=postprocessing_fn
+         )
+         self.pipeline_repo = pipeline_repo
+         self.pipeline_file_name = pipeline_file_name
+         self.pipeline_scoring_function = pipeline_scoring_function
+         self.pipeline: "Pipeline | None" = None
+         self.preprocessing_fn = preprocessing_fn
+
+     def __call__(
+         self,
+         predictions: c.Sequence,
+         references: c.Sequence,
+         dataset: "Dataset",
+         dataset_config: "DatasetConfig",
+         benchmark_config: "BenchmarkConfig",
+     ) -> float | None:
+         """Calculate the metric score using the scikit-learn pipeline.
+
+         Args:
+             predictions:
+                 The model predictions.
+             references:
+                 Not used, but required for consistency with the Metric interface.
+             dataset:
+                 The dataset used for evaluation. This is only used in case any
+                 additional metadata is used to compute the metrics.
+             dataset_config:
+                 The dataset configuration.
+             benchmark_config:
+                 The benchmark configuration.
+
+         Returns:
+             The calculated metric score, or None if the score should be ignored.
+         """
+         if self.pipeline is None:
+             self.pipeline = self._download_pipeline()
+         if self.preprocessing_fn is not None:
+             predictions = self.preprocessing_fn(
+                 predictions=predictions, dataset=dataset
+             )
+         return self.pipeline_scoring_function(self.pipeline, predictions)
+
+     def _download_pipeline(self) -> "Pipeline":
+         """Download the scikit-learn pipeline from the Hugging Face Hub.
+
+         Returns:
+             The downloaded scikit-learn pipeline.
+
+         Raises:
+             InvalidBenchmark:
+                 If the loading of the pipeline fails for any reason.
+         """
+         logger.debug(f"Loading pipeline from {self.pipeline_repo}...")
+         folder_path = hf_hub.HfApi(
+             token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_")
+         ).snapshot_download(repo_id=self.pipeline_repo, repo_type="model")
+         model_path = Path(folder_path, self.pipeline_file_name)
+         try:
+             with model_path.open(mode="rb") as f:
+                 pipeline = cloudpickle.load(f)
+         except Exception as e:
+             raise InvalidBenchmark(
+                 f"Failed to load pipeline from {self.pipeline_repo!r}: {e}"
+             ) from e
+         logger.debug(f"Successfully loaded pipeline: {pipeline}")
+         return pipeline
+
+
+ ### European Values Metric ###
+
+
+ def european_values_preprocessing_fn(
+     predictions: c.Sequence[int], dataset: "Dataset"
+ ) -> c.Sequence[int]:
+     """Preprocess the model predictions for the European Values metric.
+
+     Args:
+         predictions:
+             The model predictions, a sequence of integers representing the predicted
+             choices for each question.
+         dataset:
+             The dataset used for evaluation. This is only used in case any additional
+             metadata is used to compute the metrics.
+
+     Returns:
+         The preprocessed model predictions, a sequence of integers representing the
+         final predicted choices for each question after any necessary aggregation and
+         mapping.
+
+     Raises:
+         AssertionError:
+             If the number of predictions is not a multiple of 53, which is required
+             for the European Values metric.
+     """
+     num_questions = 53
+     num_phrasings_per_question = 5
+
+     # Convert the predictions to integers
+     integer_predictions = []
+     for prediction, idx_to_choice in zip(predictions, dataset["idx_to_choice"]):
+         idx_to_choice = {
+             int(idx): int(choice)
+             for idx, choice in idx_to_choice.items()
+             if choice is not None
+         }
+         integer_prediction = idx_to_choice[prediction]
+         integer_predictions.append(integer_prediction)
+
+     assert len(predictions) % num_questions == 0, (
+         f"The number of predictions ({len(predictions)}) is not a multiple of "
+         f"{num_questions}, which is required for the European Values metric."
+     )
+
+     # When we are using the situational version of the dataset, there are 5 phrasings
+     # for each question, so we need to aggregate the predictions by question, which we
+     # do using majority voting.
+     using_situational = len(predictions) == num_questions * num_phrasings_per_question
+     if using_situational:
+         # Reshape the predictions to a 2D array with `num_questions` rows (one for
+         # each question) and `num_phrasings_per_question` columns (one for each
+         # phrasing). The five phrasings for each question appear right after each
+         # other, e.g., (0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, ...)
+         # Shape: (num_questions, num_phrasings_per_question)
+         arr = np.array(
+             [
+                 integer_predictions[i : i + num_phrasings_per_question]
+                 for i in range(0, len(predictions), num_phrasings_per_question)
+             ]
+         )
+
+         # Double check that we reshaped the predictions correctly
+         for idx, pred in enumerate(predictions):
+             assert arr[idx // 5, idx % 5] == pred, (
+                 f"Reshaped predictions do not match the original predictions at index "
+                 f"{idx}: {arr[idx // 5, idx % 5]} != {pred}."
+             )
+
+         # Use majority voting to get the final prediction for each question
+         # Shape: (53,)
+         arr = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=arr)
+
+         # Convert the array to a list
+         integer_predictions = arr.tolist()
+
+     # Some of the questions are categorical and we're only interested in whether the
+     # model chooses a specific choice or not. This mapping takes the question index
+     # to the choice value that we're interested in.
+     question_choices = {
+         0: 1,
+         1: 5,
+         3: 3,
+         6: 1,
+         15: 4,
+         20: 2,
+         47: 8,
+         48: 7,
+         49: 4,
+         51: 4,
+         52: 4,
+     }
+
+     # Map the predictions to the choices we're interested in
+     integer_predictions = list(integer_predictions)
+     for question_idx, choice in question_choices.items():
+         integer_predictions[question_idx] = (
+             1 if integer_predictions[question_idx] == choice else 0
+         )
+
+     return integer_predictions
+
+
+ def european_values_scoring_function(
+     pipeline: "Pipeline", predictions: c.Sequence[int]
+ ) -> float:
+     """Scoring function for the European Values metric."""
+     normalised_predictions = pipeline[0].transform([predictions])
+     log_likelihoods = pipeline[1].transform(normalised_predictions)[0]
+     score = sigmoid(pipeline[2].alpha_ * (log_likelihoods - pipeline[2].center_))
+     return score.item()
+
+
+ european_values_metric = PipelineMetric(
+     name="european_values",
+     pretty_name="European Values",
+     pipeline_repo="EuroEval/european-values-pipeline",
+     pipeline_scoring_function=european_values_scoring_function,
+     preprocessing_fn=european_values_preprocessing_fn,
+ )
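
For orientation, the following is a minimal, self-contained sketch (not part of the diff or the release) of the majority-voting step that european_values_preprocessing_fn applies to the situational variant of the dataset: 53 questions with 5 adjacent phrasings each are collapsed to one answer per question. The toy predictions and the random seed are made up for illustration.

import numpy as np

num_questions, num_phrasings = 53, 5

# Hypothetical integer predictions; the 5 phrasings of each question are adjacent.
rng = np.random.default_rng(seed=0)
integer_predictions = rng.integers(low=1, high=5, size=num_questions * num_phrasings)

# Reshape to (num_questions, num_phrasings) and take the most frequent answer per row,
# mirroring the np.apply_along_axis / np.bincount call in the code above.
arr = integer_predictions.reshape(num_questions, num_phrasings)
majority_votes = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=arr)

print(majority_votes.shape)  # (53,)
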
euroeval/metrics/speed.py ADDED
@@ -0,0 +1,51 @@
+ """Inference speed metric."""
+
+ import collections.abc as c
+ import logging
+ import typing as t
+
+ from .base import Metric
+
+ if t.TYPE_CHECKING:
+     from datasets.arrow_dataset import Dataset
+
+     from ..data_models import BenchmarkConfig, DatasetConfig
+
+ logger: logging.Logger = logging.getLogger("euroeval")
+
+
+ class SpeedMetric(Metric):
+     """Speed metric."""
+
+     def __init__(self, name: str, pretty_name: str) -> None:
+         """Initialise the speed metric.
+
+         Args:
+             name:
+                 The name of the metric in snake_case.
+             pretty_name:
+                 The pretty name of the metric, used for display purposes.
+         """
+         super().__init__(
+             name=name,
+             pretty_name=pretty_name,
+             postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
+         )
+
+     def __call__(
+         self,
+         predictions: c.Sequence,
+         references: c.Sequence,
+         dataset: "Dataset",
+         dataset_config: "DatasetConfig",
+         benchmark_config: "BenchmarkConfig",
+     ) -> float | None:
+         """Not used with the speed metric, but required for consistency."""
+         raise NotImplementedError
+
+
+ speed_metric = SpeedMetric(name="speed", pretty_name="Tokens per second")
+
+ speed_short_metric = SpeedMetric(
+     name="speed_short", pretty_name="Tokens per second on short documents"
+ )
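
As a quick illustration (not part of the release), the two postprocessing conventions seen in these new metric modules differ only in how the raw score is formatted: the documented default used by PipelineMetric scales a fraction to a percentage, while SpeedMetric keeps the raw tokens-per-second value and formats it with a thousands separator.

def default_postprocess(score: float) -> tuple[float, str]:
    # Documented default for PipelineMetric: scale to a percentage.
    return 100 * score, f"{score:.2%}"

def speed_postprocess(raw_score: float) -> tuple[float, str]:
    # SpeedMetric: keep the raw value, format with a thousands separator.
    return raw_score, f"{raw_score:,.0f}"

print(default_postprocess(0.875))     # (87.5, '87.50%')
print(speed_postprocess(12345.678))   # (12345.678, '12,346')
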
euroeval/model_cache.py CHANGED
@@ -10,7 +10,9 @@ from dataclasses import asdict
  
  from tqdm.auto import tqdm
  
+ from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
  from .data_models import GenerativeModelOutput, SingleGenerativeModelOutput
+ from .utils import log_once
  
  if t.TYPE_CHECKING:
      from pathlib import Path
@@ -189,10 +191,20 @@ class ModelCache:
              # the indices of the top scores, to save space. Further, we only store
              # the scores if the generated sequence is shorter than the maximum
              # length
-             if model_output.scores is not None and self.max_generated_tokens < 8:
+             if (
+                 model_output.scores is not None
+                 and self.max_generated_tokens
+                 <= NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
+             ):
                  assert model_output.scores is not None
                  scores = model_output.scores[sample_idx]
              else:
+                 if model_output.scores is not None:
+                     log_once(
+                         "The generated sequence is longer than the maximum "
+                         "length for classification. Not caching the scores.",
+                         level=logging.DEBUG,
+                     )
                  scores = None
              self[model_input] = SingleGenerativeModelOutput(
                  sequence=model_output.sequences[sample_idx], scores=scores
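
In plain terms, the model_cache.py change above replaces the hard-coded `< 8` threshold with a named constant, so logit scores are only cached when the task generates few enough tokens to be treated as classification. A minimal sketch of that rule follows; the constant's value below is assumed for illustration only, and the real value lives in euroeval/constants.py.

NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 8  # assumed value for illustration only

def should_cache_scores(scores: object | None, max_generated_tokens: int) -> bool:
    # Mirrors the condition in ModelCache: only cache scores for short,
    # classification-style generations.
    return (
        scores is not None
        and max_generated_tokens <= NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
    )

print(should_cache_scores(scores=[[0.1, 0.9]], max_generated_tokens=1))    # True
print(should_cache_scores(scores=[[0.1, 0.9]], max_generated_tokens=256))  # False
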
euroeval/prompt_templates/linguistic_acceptability.py CHANGED
@@ -1,9 +1,32 @@
  """Templates for the Linguistic Acceptability task."""
  
+ import typing as t
+
  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
+ from ..languages import (
+     DA,
+     DE,
+     EN,
+     ES,
+     ET,
+     FI,
+     FO,
+     FR,
+     IS,
+     IT,
+     LV,
+     NB,
+     NL,
+     NN,
+     NO,
+     PT,
+     SV,
+ )
+
+ if t.TYPE_CHECKING:
+     from ..data_models import Language
  
- LA_TEMPLATES = {
+ LA_TEMPLATES: dict["Language", PromptConfig] = {
      DA: PromptConfig(
          default_prompt_label_mapping=dict(correct="ja", incorrect="nej"),
          default_prompt_prefix="Følgende er sætninger og om de er grammatisk korrekte.",
@@ -36,6 +59,14 @@ LA_TEMPLATES = {
          default_instruction_prompt="Texto: {text}\n\nDetermina si el texto es "
          "gramaticalmente correcto o no. Responde con {labels_str}, y nada más.",
      ),
+     ET: PromptConfig(
+         default_prompt_label_mapping=dict(correct="jah", incorrect="ei"),
+         default_prompt_prefix="Järgnevad on laused ja kas need on grammatiliselt "
+         "õiged.",
+         default_prompt_template="Lause: {text}\nGrammatikaliselt õige: {label}",
+         default_instruction_prompt="Lause: {text}\n\nOtsusta, kas lause on "
+         "grammatiliselt õige või mitte. Vasta {labels_str}, ja mitte midagi muud.",
+     ),
      PT: PromptConfig(
          default_prompt_label_mapping=dict(correct="sim", incorrect="não"),
          default_prompt_prefix="Seguem-se abaixo textos e se são "
@@ -85,6 +116,13 @@ LA_TEMPLATES = {
          default_instruction_prompt="Frase: {text}\n\nStabilite se la frase è "
          "grammaticalmente corretta o meno. Rispondere con {labels_str}, e nient'altro.",
      ),
+     LV: PromptConfig(
+         default_prompt_label_mapping=dict(correct="jā", incorrect="nē"),
+         default_prompt_prefix="Šie ir teikumi un to gramatiskie pareizumi.",
+         default_prompt_template="Teikums: {text}\nGramatiski pareizs: {label}",
+         default_instruction_prompt="Teikums: {text}\n\nNoteiciet, vai teikums ir "
+         "gramatiski pareizs vai nē. Atbildiet ar {labels_str}, un neko citu.",
+     ),
      NB: PromptConfig(
          default_prompt_label_mapping=dict(correct="ja", incorrect="nei"),
          default_prompt_prefix="Følgende er setninger og hvorvidt de er grammatisk "
euroeval/prompt_templates/multiple_choice.py CHANGED
@@ -1,10 +1,15 @@
  """Templates for all multiple choice tasks."""
  
+ import typing as t
+
  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, PT, SV
+ from ..languages import DA, DE, EN, ES, ET, FI, FR, IS, IT, LV, NB, NL, NN, NO, PT, SV
+
+ if t.TYPE_CHECKING:
+     from ..data_models import Language
  
  # TODO: Missing Faroese
- MULTIPLE_CHOICE_TEMPLATES = {
+ MULTIPLE_CHOICE_TEMPLATES: dict["Language", PromptConfig] = {
      DA: PromptConfig(
          default_prompt_prefix="Følgende er multiple choice spørgsmål (med svar).",
          default_prompt_template="Spørgsmål: {text}\nSvar: {label}",
@@ -36,6 +41,14 @@ MULTIPLE_CHOICE_TEMPLATES = {
          "usando solo {labels_str}, y nada más.",
          default_prompt_label_mapping="auto",
      ),
+     ET: PromptConfig(
+         default_prompt_prefix="Järgnevad on vastusevariantidega küsimused (koos "
+         "vastustega).",
+         default_prompt_template="Küsimus: {text}\nVastus: {label}",
+         default_instruction_prompt="Küsimus: {text}\n\nVasta ülaltoodud küsimusele "
+         "ainult {labels_str}, ja mitte millegi muuga.",
+         default_prompt_label_mapping="auto",
+     ),
      PT: PromptConfig(
          default_prompt_prefix="As seguintes são perguntas de escolha múltipla "
          "(com respostas).",
@@ -74,6 +87,14 @@ MULTIPLE_CHOICE_TEMPLATES = {
          "precedente con {labels_str}, e nient'altro.",
          default_prompt_label_mapping="auto",
      ),
+     LV: PromptConfig(
+         default_prompt_prefix="Tālāk seko jautājumi ar vairākām atbilžu izvēlēm "
+         "(ar atbildēm).",
+         default_prompt_template="Jautājums: {text}\nAtbilde: {label}",
+         default_instruction_prompt="Jautājums: {text}\n\nAtbildiet uz iepriekšējo "
+         "jautājumu, atbildot ar {labels_str}, un nekas cits.",
+         default_prompt_label_mapping="auto",
+     ),
      NB: PromptConfig(
          default_prompt_prefix="Følgende er flervalgsspørsmål (med svar).",
          default_prompt_template="Spørsmål: {text}\nSvar: {label}",
euroeval/prompt_templates/named_entity_recognition.py CHANGED
@@ -1,9 +1,33 @@
  """Templates for the Named Entity Recognition task."""
  
+ import typing as t
+
  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
+ from ..languages import (
+     DA,
+     DE,
+     EN,
+     ES,
+     ET,
+     FI,
+     FO,
+     FR,
+     IS,
+     IT,
+     LV,
+     NB,
+     NL,
+     NN,
+     NO,
+     PT,
+     SV,
+ )
+
+ if t.TYPE_CHECKING:
+     from ..data_models import Language
+
  
- NER_TEMPLATES = {
+ NER_TEMPLATES: dict["Language", PromptConfig] = {
      DA: PromptConfig(
          default_prompt_label_mapping={
              "b-per": "person",
@@ -80,6 +104,25 @@ NER_TEMPLATES = {
          "claves {labels_str}. Los valores deben ser listas de las "
          "entidades nombradas de ese tipo, exactamente como aparecen en la oración.",
      ),
+     ET: PromptConfig(
+         default_prompt_label_mapping={
+             "b-per": "inimene",
+             "i-per": "inimene",
+             "b-loc": "asukoht",
+             "i-loc": "asukoht",
+             "b-org": "organisatsioon",
+             "i-org": "organisatsioon",
+             "b-misc": "muu",
+             "i-misc": "muu",
+         },
+         default_prompt_prefix="Allpool on laused ja JSON-sõnastikud, mis sisaldavad "
+         "antud lauses esinevaid nimetatud üksuseid.",
+         default_prompt_template="Lause: {text}\nNimetatud üksused: {label}",
+         default_instruction_prompt="Lause: {text}\n\nTuvasta lauses "
+         "nimetatud üksused. Väljund peaks olema JSON-sõnastik, "
+         "mille võtmed on {labels_str}. Väärtused peaksid olema kindlat tüüpi nimetatud "
+         "üksuste loendid, täpselt nii nagu need lauses esinevad.",
+     ),
      PT: PromptConfig(
          default_prompt_label_mapping={
              "b-per": "pessoa",
@@ -197,6 +240,26 @@ NER_TEMPLATES = {
          "{labels_str}. I valori devono essere elenchi di entità "
          "nominate di quel tipo, esattamente come appaiono nella frase.",
      ),
+     LV: PromptConfig(
+         default_prompt_label_mapping={
+             "b-per": "persona",
+             "i-per": "persona",
+             "b-loc": "vieta",
+             "i-loc": "vieta",
+             "b-org": "organizācija",
+             "i-org": "organizācija",
+             "b-misc": "dažādi",
+             "i-misc": "dažādi",
+         },
+         default_prompt_prefix="Tālāk ir teikumi un JSON vārdnīcas ar nosauktajiem "
+         "objektiem, kas parādās dotajā teikumā.",
+         default_prompt_template="Teikums: {text}\nNosauktie objekti: {label}",
+         default_instruction_prompt="Teikums: {text}\n\n"
+         "Identificējiet nosauktos objektus "
+         "teikumā. Jums jāizvada šī informācija kā JSON vārdnīcu ar atslēgām "
+         "{labels_str}. Vērtībām jābūt šī tipa nosaukto objektu sarakstiem, "
+         "tieši tā, kā tie parādās teikumā.",
+     ),
      NB: PromptConfig(
          default_prompt_label_mapping={
              "b-per": "person",
euroeval/prompt_templates/reading_comprehension.py CHANGED
@@ -1,9 +1,32 @@
  """Templates for the Reading Comprehension task."""
  
+ import typing as t
+
  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
+ from ..languages import (
+     DA,
+     DE,
+     EN,
+     ES,
+     ET,
+     FI,
+     FO,
+     FR,
+     IS,
+     IT,
+     LV,
+     NB,
+     NL,
+     NN,
+     NO,
+     PT,
+     SV,
+ )
+
+ if t.TYPE_CHECKING:
+     from ..data_models import Language
  
- RC_TEMPLATES = {
+ RC_TEMPLATES: dict["Language", PromptConfig] = {
      DA: PromptConfig(
          default_prompt_prefix="Følgende er tekster med tilhørende spørgsmål og svar.",
          default_prompt_template="Tekst: {text}\nSpørgsmål: {question}\nSvar med maks. "
@@ -39,6 +62,14 @@ RC_TEMPLATES = {
          "sobre el texto anterior en máximo 3 palabras.\n\nPregunta: {question}",
          default_prompt_label_mapping=dict(),
      ),
+     ET: PromptConfig(
+         default_prompt_prefix="Järgnevad on tekstid koos küsimuste ja vastustega.",
+         default_prompt_template="Tekst: {text}\nKüsimus: {question}\nVasta "
+         "maksimaalselt 3 sõnaga: {label}",
+         default_instruction_prompt="Tekst: {text}\n\nVasta järgmisele küsimusele "
+         "ülevaltoodud teksti kohta maksimaalselt 3 sõnaga.\n\nKüsimus: {question}",
+         default_prompt_label_mapping=dict(),
+     ),
      FI: PromptConfig(
          default_prompt_prefix="Seuraavassa on tekstejä ja niihin liittyviä kysymyksiä "
          "ja vastauksia.",
@@ -84,6 +115,15 @@ RC_TEMPLATES = {
          "sul in un massimo di 3 parole.\n\nDomanda: {question}",
          default_prompt_label_mapping=dict(),
      ),
+     LV: PromptConfig(
+         default_prompt_prefix="Turpmāk seko teksti ar atbilstošiem jautājumiem un "
+         "atbildēm.",
+         default_prompt_template="Teksts: {text}\nJautājums: {question}\nAtbildēt ar "
+         "maksimāli 3 vārdiem: {label}",
+         default_instruction_prompt="Teksts: {text}\n\nAtbildiet uz šo jautājumu par "
+         "iepriekš minēto tekstu ar maksimāli 3 vārdiem.\n\nJautājums: {question}",
+         default_prompt_label_mapping=dict(),
+     ),
      NB: PromptConfig(
          default_prompt_prefix="Her følger tekster med tilhørende spørsmål og svar.",
          default_prompt_template="Tekst: {text}\nSpørsmål: {question}\nSvar på maks 3 "
euroeval/prompt_templates/sentiment_classification.py CHANGED
@@ -1,9 +1,32 @@
  """Templates for the Sentiment Analysis task."""
  
+ import typing as t
+
  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
+ from ..languages import (
+     DA,
+     DE,
+     EN,
+     ES,
+     ET,
+     FI,
+     FO,
+     FR,
+     IS,
+     IT,
+     LV,
+     NB,
+     NL,
+     NN,
+     NO,
+     PT,
+     SV,
+ )
+
+ if t.TYPE_CHECKING:
+     from ..data_models import Language
  
- SENT_TEMPLATES = {
+ SENT_TEMPLATES: dict["Language", PromptConfig] = {
      DA: PromptConfig(
          default_prompt_label_mapping=dict(
              positive="positiv", neutral="neutral", negative="negativ"
@@ -44,6 +67,17 @@ SENT_TEMPLATES = {
          default_instruction_prompt="Documento: {text}\n\nClasifica el sentimiento del "
          "documento. Responde con {labels_str}, y nada más.",
      ),
+     ET: PromptConfig(
+         default_prompt_label_mapping=dict(
+             positive="positiivne", neutral="neutraalne", negative="negatiivne"
+         ),
+         default_prompt_prefix="Järgmised on dokumendid ja nende meelestatus, "
+         "mis võib olla {labels_str}.",
+         default_prompt_template="Dokument: {text}\nMeelestatus: {label}",
+         default_instruction_prompt="Dokument: {text}\n\nKlassifitseeri dokument "
+         "meelestatuse järgi. Võimalikud vastused: {labels_str}. Muud vastused "
+         "ei ole lubatud.",
+     ),
      PT: PromptConfig(
          default_prompt_label_mapping=dict(
              positive="positivo", neutral="neutro", negative="negativo"
@@ -104,6 +138,16 @@ SENT_TEMPLATES = {
          default_instruction_prompt="Documento: {text}\n\nClassificare il sentiment del "
          "documento. Rispondere con {labels_str}, e nient'altro.",
      ),
+     LV: PromptConfig(
+         default_prompt_label_mapping=dict(
+             positive="pozitīvs", neutral="neitrāls", negative="negatīvs"
+         ),
+         default_prompt_prefix="Tālāk ir dokumenti un to noskaņojums, kas var būt "
+         "{labels_str}.",
+         default_prompt_template="Dokuments: {text}\nNoskaņojums: {label}",
+         default_instruction_prompt="Dokuments: {text}\n\nKlasificējiet noskaņojumu "
+         "dokumentā. Atbildiet ar {labels_str}, un neko citu.",
+     ),
      NB: PromptConfig(
          default_prompt_label_mapping=dict(
              positive="positiv", neutral="nøytral", negative="negativ"