PyPI - EuroEval - Versions diffs - 16.2.2__py3-none-any.whl → 16.3.0__py3-none-any.whl - Mend

EuroEval 16.2.2__py3-none-any.whl → 16.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of EuroEval might be problematic. Click here for more details.

Files changed (38) [hide / show]

euroeval/__init__.py +4 -2
euroeval/benchmark_modules/fresh.py +3 -1
euroeval/benchmark_modules/hf.py +8 -4
euroeval/benchmark_modules/litellm.py +5 -17
euroeval/benchmark_modules/vllm.py +88 -23
euroeval/benchmarker.py +110 -61
euroeval/cli.py +1 -1
euroeval/constants.py +3 -0
euroeval/dataset_configs/__init__.py +1 -0
euroeval/dataset_configs/danish.py +0 -2
euroeval/dataset_configs/dutch.py +0 -2
euroeval/dataset_configs/english.py +0 -2
euroeval/dataset_configs/finnish.py +0 -2
euroeval/dataset_configs/french.py +0 -2
euroeval/dataset_configs/german.py +0 -2
euroeval/dataset_configs/italian.py +0 -2
euroeval/dataset_configs/latvian.py +2 -3
euroeval/dataset_configs/lithuanian.py +62 -0
euroeval/dataset_configs/norwegian.py +0 -2
euroeval/dataset_configs/polish.py +0 -2
euroeval/dataset_configs/portuguese.py +0 -2
euroeval/dataset_configs/spanish.py +0 -2
euroeval/dataset_configs/swedish.py +0 -3
euroeval/metrics/huggingface.py +1 -1
euroeval/metrics/pipeline.py +5 -0
euroeval/prompt_templates/linguistic_acceptability.py +9 -0
euroeval/prompt_templates/multiple_choice.py +9 -0
euroeval/prompt_templates/named_entity_recognition.py +20 -0
euroeval/prompt_templates/reading_comprehension.py +10 -0
euroeval/prompt_templates/sentiment_classification.py +11 -0
euroeval/tokenisation_utils.py +8 -8
euroeval/utils.py +1 -1
{euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/METADATA +181 -60
euroeval-16.3.0.dist-info/RECORD +71 -0
euroeval-16.2.2.dist-info/RECORD +0 -70
{euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/WHEEL +0 -0
{euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/entry_points.txt +0 -0
{euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/licenses/LICENSE +0 -0

euroeval/dataset_configs/finnish.py CHANGED Viewed

@@ -1,7 +1,6 @@
 """All Finnish dataset configurations used in EuroEval."""
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import FI
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, LA, MCRC, NER, RC, SENT, SUMM
@@ -111,7 +110,6 @@ WINOGRANDE_FI_CONFIG = DatasetConfig(
     languages=[FI],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )

euroeval/dataset_configs/french.py CHANGED Viewed

@@ -1,7 +1,6 @@
 """All French dataset configurations used in EuroEval."""
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import FR
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
@@ -123,7 +122,6 @@ WINOGRANDE_FR_CONFIG = DatasetConfig(
     languages=[FR],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )

euroeval/dataset_configs/german.py CHANGED Viewed

@@ -1,7 +1,6 @@
 """All German dataset configurations used in EuroEval."""
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import DE
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
@@ -140,7 +139,6 @@ WINOGRANDE_DE_CONFIG = DatasetConfig(
     languages=[DE],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )

euroeval/dataset_configs/italian.py CHANGED Viewed

@@ -1,7 +1,6 @@
 """All Italian dataset configurations used in EuroEval."""
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import IT
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
@@ -131,7 +130,6 @@ WINOGRANDE_IT_CONFIG = DatasetConfig(
     languages=[IT],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )

euroeval/dataset_configs/latvian.py CHANGED Viewed

@@ -1,7 +1,6 @@
 """All Latvian dataset configurations used in EuroEval."""
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import LV
 from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
@@ -25,7 +24,8 @@ SCALA_LV_CONFIG = DatasetConfig(
 FULLSTACK_NER_LV_CONFIG = DatasetConfig(
     name="fullstack-ner-lv",
-    pretty_name="the truncated version of the FullStack NER dataset",
+    pretty_name="the truncated version of the Latvian named entity recognition "
+    "dataset FullStack-NER-lv",
     huggingface_id="EuroEval/fullstack-ner-lv-mini",
     task=NER,
     languages=[LV],
@@ -90,6 +90,5 @@ WINOGRANDE_LV_CONFIG = DatasetConfig(
     languages=[LV],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )

euroeval/dataset_configs/lithuanian.py ADDED Viewed

@@ -0,0 +1,62 @@
+"""All Lithuanian dataset configurations used in EuroEval."""
+from ..data_models import DatasetConfig
+from ..languages import LT
+from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
+### Official datasets ###
+LITHUANIAN_EMOTIONS_CONFIG = DatasetConfig(
+    name="lithuanian-emotions",
+    pretty_name="the truncated version of the Lithuanian sentiment "
+    "classification dataset Lithuanian Emotions",
+    huggingface_id="EuroEval/lithuanian-emotions-mini",
+    task=SENT,
+    languages=[LT],
+)
+SCALA_LT_CONFIG = DatasetConfig(
+    name="scala-lt",
+    pretty_name="the Lithuanian part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-lt",
+    task=LA,
+    languages=[LT],
+)
+WIKIANN_LT_CONFIG = DatasetConfig(
+    name="wikiann-lt",
+    pretty_name="the truncated version of the Lithuanian part of the named entity "
+    "recognition dataset WikiANN",
+    huggingface_id="EuroEval/wikiann-lt-mini",
+    task=NER,
+    languages=[LT],
+)
+MULTI_WIKI_QA_LT_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-lt",
+    pretty_name="the truncated version of the Lithuanian part of the reading "
+    "comprehension dataset MultiWikiQA",
+    huggingface_id="EuroEval/multi-wiki-qa-lt-mini",
+    task=RC,
+    languages=[LT],
+)
+LT_HISTORY_CONFIG = DatasetConfig(
+    name="lt-history",
+    pretty_name="the Lithuanian knowledge dataset LT-History",
+    huggingface_id="EuroEval/lt-history",
+    task=KNOW,
+    languages=[LT],
+    splits=["train", "test"],
+)
+WINOGRANDE_LT_CONFIG = DatasetConfig(
+    name="winogrande-lt",
+    pretty_name="the Lithuanian common-sense reasoning dataset Winogrande-lt, "
+    "translated from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-lt",
+    task=COMMON_SENSE,
+    languages=[LT],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+)

euroeval/dataset_configs/norwegian.py CHANGED Viewed

@@ -1,7 +1,6 @@
 """All Norwegian dataset configurations used in EuroEval."""
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import NB, NN, NO
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
@@ -226,7 +225,6 @@ WINOGRANDE_NO_CONFIG = DatasetConfig(
     languages=[NB, NN, NO],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )

euroeval/dataset_configs/polish.py CHANGED Viewed

@@ -1,7 +1,6 @@
 """All Polish dataset configurations used in EuroEval."""
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import PL
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, NER, RC, SENT, SUMM
@@ -64,7 +63,6 @@ WINOGRANDE_PL_CONFIG = DatasetConfig(
     languages=[PL],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
 )
 EUROPEAN_VALUES_PL_CONFIG = DatasetConfig(

euroeval/dataset_configs/portuguese.py CHANGED Viewed

@@ -1,7 +1,6 @@
 """All Portuguese dataset configurations used in EuroEval."""
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import PT
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
@@ -101,7 +100,6 @@ WINOGRANDE_PT_CONFIG = DatasetConfig(
     languages=[PT],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )

euroeval/dataset_configs/spanish.py CHANGED Viewed

@@ -1,7 +1,6 @@
 """All Spanish dataset configurations used in EuroEval."""
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import ES
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
@@ -129,7 +128,6 @@ WINOGRANDE_ES_CONFIG = DatasetConfig(
     languages=[ES],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )

euroeval/dataset_configs/swedish.py CHANGED Viewed

@@ -1,7 +1,6 @@
 """All Swedish dataset configurations used in EuroEval."""
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import SV
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
@@ -140,7 +139,6 @@ WINOGRANDE_SV_CONFIG = DatasetConfig(
     languages=[SV],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
@@ -177,6 +175,5 @@ SKOLPROV_CONFIG = DatasetConfig(
     task=KNOW,
     languages=[SV],
     splits=["train", "test"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )

euroeval/metrics/huggingface.py CHANGED Viewed

@@ -197,7 +197,7 @@ bert_score_metric = HuggingFaceMetric(
     huggingface_id="bertscore",
     results_key="f1",
     compute_kwargs=dict(
-        model_type="microsoft/mdeberta-v3-base", device="cpu", batch_size=16
+        model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
     ),
 )

euroeval/metrics/pipeline.py CHANGED Viewed

@@ -191,6 +191,11 @@ def european_values_preprocessing_fn(
             for idx, choice in idx_to_choice.items()
             if choice is not None
         }
+        if prediction not in idx_to_choice:
+            raise InvalidBenchmark(
+                f"The prediction {prediction} is not a valid index for the "
+                f"question with choices {idx_to_choice}."
+            )
         integer_prediction = idx_to_choice[prediction]
         integer_predictions.append(integer_prediction)

euroeval/prompt_templates/linguistic_acceptability.py CHANGED Viewed

@@ -14,6 +14,7 @@ from ..languages import (
     FR,
     IS,
     IT,
+    LT,
     LV,
     NB,
     NL,
@@ -126,6 +127,14 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
         default_instruction_prompt="Frase: {text}\n\nStabilite se la frase è "
         "grammaticalmente corretta o meno. Rispondere con {labels_str}, e nient'altro.",
     ),
+    LT: PromptConfig(
+        default_prompt_label_mapping=dict(correct="taip", incorrect="ne"),
+        default_prompt_prefix="Toliau pateikti sakiniai ir ar jie yra gramatiškai "
+        "teisingi.",
+        default_prompt_template="Sakinys: {text}\nGramatiškai teisingas: {label}",
+        default_instruction_prompt="Sakinys: {text}\n\nNustatykite, ar sakinys yra "
+        "gramatiškai teisingas, ar ne. Atsakykite su {labels_str}, ir nieko kito.",
+    ),
     LV: PromptConfig(
         default_prompt_label_mapping=dict(correct="jā", incorrect="nē"),
         default_prompt_prefix="Šie ir teikumi un to gramatiskie pareizumi.",

euroeval/prompt_templates/multiple_choice.py CHANGED Viewed

@@ -13,6 +13,7 @@ from ..languages import (
     FR,
     IS,
     IT,
+    LT,
     LV,
     NB,
     NL,
@@ -105,6 +106,14 @@ MULTIPLE_CHOICE_TEMPLATES: dict["Language", PromptConfig] = {
         "precedente con {labels_str}, e nient'altro.",
         default_prompt_label_mapping="auto",
     ),
+    LT: PromptConfig(
+        default_prompt_prefix="Toliau pateikti daugiavariančiai klausimai "
+        "(su atsakymais).",
+        default_prompt_template="Klausimas: {text}\nAtsakymas: {label}",
+        default_instruction_prompt="Klausimas: {text}\n\nAtsakykite į aukščiau "
+        "pateiktą klausimą atsakydami {labels_str}, ir nieko daugiau.",
+        default_prompt_label_mapping="auto",
+    ),
     LV: PromptConfig(
         default_prompt_prefix="Tālāk seko jautājumi ar vairākām atbilžu izvēlēm "
         "(ar atbildēm).",

euroeval/prompt_templates/named_entity_recognition.py CHANGED Viewed

@@ -14,6 +14,7 @@ from ..languages import (
     FR,
     IS,
     IT,
+    LT,
     LV,
     NB,
     NL,
@@ -241,6 +242,25 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
         "{labels_str}. I valori devono essere elenchi di entità "
         "nominate di quel tipo, esattamente come appaiono nella frase.",
     ),
+    LT: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "asmuo",
+            "i-per": "asmuo",
+            "b-loc": "vieta",
+            "i-loc": "vieta",
+            "b-org": "organizacija",
+            "i-org": "organizacija",
+            "b-misc": "kita",
+            "i-misc": "kita",
+        },
+        default_prompt_prefix="Toliau pateikti sakiniai ir JSON žodynai su vardiniais "
+        "vienetais, kurie pateikiame sakinyje.",
+        default_prompt_template="Sakinys: {text}\nVardiniai vienetai: {label}",
+        default_instruction_prompt="Sakinys: {text}\n\nIdentifikuokite vardinius "
+        "vienetus sakinyje. Turėtumėte pateikti tai kaip JSON žodyną su raktais "
+        "{labels_str}. Reikšmės turi būti to tipo vardinių vienetų sąrašai, "
+        "tiksliai taip, kaip jie rodomi sakinyje.",
+    ),
     LV: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "persona",

euroeval/prompt_templates/reading_comprehension.py CHANGED Viewed

@@ -14,6 +14,7 @@ from ..languages import (
     FR,
     IS,
     IT,
+    LT,
     LV,
     NB,
     NL,
@@ -116,6 +117,15 @@ RC_TEMPLATES: dict["Language", PromptConfig] = {
         "sul in un massimo di 3 parole.\n\nDomanda: {question}",
         default_prompt_label_mapping=dict(),
     ),
+    LT: PromptConfig(
+        default_prompt_prefix="Toliau pateikti tekstai su atitinkamais klausimais ir "
+        "atsakymais.",
+        default_prompt_template="Tekstas: {text}\nKlausimas: {question}\nAtsakykite ne "
+        "daugiau kaip 3 žodžiais: {label}",
+        default_instruction_prompt="Tekstas: {text}\n\nAtsakykite į šį klausimą apie "
+        "aukščiau pateiktą tekstą ne daugiau kaip 3 žodžiais.\n\nKlausimas: {question}",
+        default_prompt_label_mapping=dict(),
+    ),
     LV: PromptConfig(
         default_prompt_prefix="Turpmāk seko teksti ar atbilstošiem jautājumiem un "
         "atbildēm.",

euroeval/prompt_templates/sentiment_classification.py CHANGED Viewed

@@ -14,6 +14,7 @@ from ..languages import (
     FR,
     IS,
     IT,
+    LT,
     LV,
     NB,
     NL,
@@ -153,6 +154,16 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
         default_instruction_prompt="Documento: {text}\n\nClassificare il sentiment del "
         "documento. Rispondere con {labels_str}, e nient'altro.",
     ),
+    LT: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="teigiamas", neutral="neutralus", negative="neigiamas"
+        ),
+        default_prompt_prefix="Toliau pateikti dokumentai ir jų nuotaika, kuri "
+        "gali būti {labels_str}.",
+        default_prompt_template="Dokumentas: {text}\nNuotaika: {label}",
+        default_instruction_prompt="Dokumentas: {text}\n\nKlasifikuokite nuotaiką "
+        "dokumente. Atsakykite su {labels_str}, ir nieko kito.",
+    ),
     LV: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="pozitīvs", neutral="neitrāls", negative="negatīvs"

euroeval/tokenisation_utils.py CHANGED Viewed

@@ -521,7 +521,14 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
     Returns:
         Whether the tokeniser has a chat template.
     """
-    if hasattr(tokeniser, "chat_template"):
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        log_once(
+            "The tokeniser is a Mistral tokeniser, so assuming that the model is "
+            "instruction tuned.",
+            level=logging.DEBUG,
+        )
+        return True
+    elif hasattr(tokeniser, "chat_template"):
         has_template = tokeniser.chat_template is not None
         if has_template:
             log_once(
@@ -530,13 +537,6 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
                 level=logging.DEBUG,
             )
         return has_template
-    elif isinstance(tokeniser, MistralCommonTokenizer):
-        log_once(
-            "The tokeniser is a Mistral tokeniser, so assuming that the model is "
-            "instruction tuned.",
-            level=logging.DEBUG,
-        )
-        return True
     else:
         log_once(
             "We cannot find a chat template for the tokeniser, so assuming that the "

euroeval/utils.py CHANGED Viewed

@@ -462,7 +462,7 @@ def extract_json_dict_from_string(s: str) -> dict | None:
     Returns:
         The extracted JSON dictionary, or None if no JSON dictionary could be found.
     """
-    json_regex = r"\{[^{}]+?\}"
+    json_regex = r"\{[^{}]*?\}"
     if (json_match := re.search(pattern=json_regex, string=s, flags=re.DOTALL)) is None:
         logger.debug(
             "The model output does not contain any JSON dictionary, so cannot parse "

EuroEval 16.2.2__py3-none-any.whl → 16.3.0__py3-none-any.whl

Potentially problematic release.

EuroEval 16.2.2__py3-none-any.whl → 16.3.0__py3-none-any.whl