EuroEval 16.2.1-py3-none-any.whl → 16.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +4 -2
- euroeval/benchmark_modules/fresh.py +3 -1
- euroeval/benchmark_modules/hf.py +8 -4
- euroeval/benchmark_modules/litellm.py +5 -17
- euroeval/benchmark_modules/vllm.py +98 -30
- euroeval/benchmarker.py +291 -405
- euroeval/cli.py +1 -1
- euroeval/constants.py +3 -0
- euroeval/data_models.py +35 -35
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +0 -2
- euroeval/dataset_configs/dutch.py +0 -2
- euroeval/dataset_configs/english.py +0 -2
- euroeval/dataset_configs/finnish.py +0 -2
- euroeval/dataset_configs/french.py +0 -2
- euroeval/dataset_configs/german.py +0 -2
- euroeval/dataset_configs/italian.py +0 -2
- euroeval/dataset_configs/latvian.py +2 -3
- euroeval/dataset_configs/lithuanian.py +62 -0
- euroeval/dataset_configs/norwegian.py +0 -2
- euroeval/dataset_configs/polish.py +0 -2
- euroeval/dataset_configs/portuguese.py +0 -2
- euroeval/dataset_configs/spanish.py +0 -2
- euroeval/dataset_configs/swedish.py +0 -3
- euroeval/metrics/huggingface.py +1 -1
- euroeval/metrics/pipeline.py +5 -0
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +9 -0
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +10 -0
- euroeval/prompt_templates/sentiment_classification.py +11 -0
- euroeval/tokenisation_utils.py +8 -8
- euroeval/utils.py +10 -5
- {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/METADATA +181 -60
- euroeval-16.3.0.dist-info/RECORD +71 -0
- euroeval-16.2.1.dist-info/RECORD +0 -70
- {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/WHEEL +0 -0
- {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/licenses/LICENSE +0 -0
euroeval/cli.py
CHANGED

@@ -188,7 +188,7 @@ from .tasks import get_all_tasks
 )
 @click.option(
     "--gpu-memory-utilization",
-    default=0.
+    default=0.8,
     show_default=True,
     help="The GPU memory utilization to use for vLLM. A larger value will result in "
     "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "

euroeval/constants.py
CHANGED

@@ -50,9 +50,11 @@ METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
 # Hugging Face Hub tags used to classify models as merge models
 MERGE_TAGS = ["merge", "mergekit"]
 
+
 # The minimum required CUDA compute capability for using bfloat16 in vLLM
 VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
 
+
 # Used to detect whether a model is a reasoning model
 REASONING_TOKENS = [
     ("<think>", "</think>"),
@@ -60,6 +62,7 @@ REASONING_TOKENS = [
     ("<reasoning>", "</reasoning>"),
 ]
 
+
 # These tokens are sometimes used by models to indicate the end of a generated
 # response, but they do not use them as a proper EOS token, so we have to deal with them
 # manually. We only use them as stop tokens if they actually appear in the model's

euroeval/data_models.py
CHANGED

@@ -170,14 +170,16 @@ class BenchmarkConfig:
     """General benchmarking configuration, across datasets and models.
 
     Attributes:
-        model_languages:
-            The languages of the models to benchmark.
-        dataset_languages:
-            The languages of the datasets in the benchmark.
         tasks:
             The tasks benchmark the model(s) on.
         datasets:
             The datasets to benchmark on.
+        model_languages:
+            The languages of the models to benchmark.
+        dataset_languages:
+            The languages of the datasets in the benchmark.
+        device:
+            The device to use for benchmarking.
         batch_size:
             The batch size to use.
         raise_errors:
@@ -186,17 +188,16 @@
             Directory to store cached models and datasets.
         api_key:
             The API key to use for a given inference API.
-
-
-
+        api_base:
+            The base URL for a given inference API. Only relevant if `model` refers to a
+            model on an inference API.
+        api_version:
+            The version of the API to use. Only relevant if `model` refers to a model on
+            an inference API.
         progress_bar:
             Whether to show a progress bar.
         save_results:
             Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
-        device:
-            The device to use for benchmarking.
-        verbose:
-            Whether to print verbose output.
         trust_remote_code:
             Whether to trust remote code when loading models from the Hugging Face Hub.
         clear_model_cache:
@@ -208,21 +209,11 @@
             if the model is generative.
         num_iterations:
             The number of iterations each model should be evaluated for.
-        api_base:
-            The base URL for a given inference API. Only relevant if `model` refers to a
-            model on an inference API.
-        api_version:
-            The version of the API to use. Only relevant if `model` refers to a model on
-            an inference API.
         gpu_memory_utilization:
             The GPU memory utilization to use for vLLM. A larger value will result in
             faster evaluation, but at the risk of running out of GPU memory. Only reduce
             this if you are running out of GPU memory. Only relevant if the model is
             generative.
-        debug:
-            Whether to run the benchmark in debug mode.
-        run_with_cli:
-            Whether the benchmark is being run with the CLI.
         requires_safetensors:
             Whether to only allow models that use the safetensors format.
         generative_type:
@@ -231,6 +222,15 @@ class BenchmarkConfig:
         download_only:
             Whether to only download the models, metrics and datasets without
             evaluating.
+        force:
+            Whether to force the benchmark to run even if the results are already
+            cached.
+        verbose:
+            Whether to print verbose output.
+        debug:
+            Whether to run the benchmark in debug mode.
+        run_with_cli:
+            Whether the benchmark is being run with the CLI.
     """
 
     model_languages: list[Language]
@@ -241,24 +241,24 @@ class BenchmarkConfig:
     raise_errors: bool
     cache_dir: str
     api_key: str | None
-
+    api_base: str | None
+    api_version: str | None
     progress_bar: bool
     save_results: bool
     device: torch.device
-    verbose: bool
     trust_remote_code: bool
     clear_model_cache: bool
     evaluate_test_split: bool
     few_shot: bool
     num_iterations: int
-    api_base: str | None
-    api_version: str | None
     gpu_memory_utilization: float
-    debug: bool
-    run_with_cli: bool
     requires_safetensors: bool
     generative_type: GenerativeType | None
     download_only: bool
+    force: bool
+    verbose: bool
+    debug: bool
+    run_with_cli: bool
 
 
 class BenchmarkConfigParams(pydantic.BaseModel):
@@ -266,10 +266,10 @@ class BenchmarkConfigParams(pydantic.BaseModel):
 
     model_config = pydantic.ConfigDict(protected_namespaces=())
 
-    progress_bar: bool
-    save_results: bool
     task: str | list[str] | None
     dataset: str | list[str] | None
+    progress_bar: bool
+    save_results: bool
     language: str | list[str]
     model_language: str | list[str] | None
    dataset_language: str | list[str] | None
@@ -278,21 +278,21 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     raise_errors: bool
     cache_dir: str
     api_key: str | None
-
-
+    api_base: str | None
+    api_version: str | None
     trust_remote_code: bool
     clear_model_cache: bool
     evaluate_test_split: bool
     few_shot: bool
     num_iterations: int
-
-
+    requires_safetensors: bool
+    download_only: bool
     gpu_memory_utilization: float
     generative_type: GenerativeType | None
-
+    force: bool
+    verbose: bool
     debug: bool
     run_with_cli: bool
-    requires_safetensors: bool
 
 
 class BenchmarkResult(pydantic.BaseModel):

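Since the BenchmarkConfig attributes are reordered here (the API-related fields move up and force, verbose, debug and run_with_cli move to the end), positional construction would change meaning while keyword construction is unaffected. A minimal sketch for checking the new trailing field order, assuming BenchmarkConfig is still a plain dataclass as the bare annotations above suggest:

    import dataclasses

    from euroeval.data_models import BenchmarkConfig

    # Per the diff above, this should print ['force', 'verbose', 'debug', 'run_with_cli']
    # in 16.3.0 (assumes BenchmarkConfig is a dataclass).
    print([field.name for field in dataclasses.fields(BenchmarkConfig)][-4:])
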
euroeval/dataset_configs/__init__.py
CHANGED

@@ -14,6 +14,7 @@ from .german import * # noqa: F403
 from .icelandic import *  # noqa: F403
 from .italian import *  # noqa: F403
 from .latvian import *  # noqa: F403
+from .lithuanian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
 from .polish import *  # noqa: F403
 from .portuguese import *  # noqa: F403

euroeval/dataset_configs/danish.py
CHANGED

@@ -1,7 +1,6 @@
 """All Danish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import DA
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -159,7 +158,6 @@ WINOGRANDE_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/dutch.py
CHANGED

@@ -1,7 +1,6 @@
 """All Dutch dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import NL
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -152,7 +151,6 @@ WINOGRANDE_NL_CONFIG = DatasetConfig(
     languages=[NL],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/english.py
CHANGED

@@ -1,7 +1,6 @@
 """All English dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import EN
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -135,7 +134,6 @@ WINOGRANDE_CONFIG = DatasetConfig(
     languages=[EN],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/finnish.py
CHANGED

@@ -1,7 +1,6 @@
 """All Finnish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import FI
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -111,7 +110,6 @@ WINOGRANDE_FI_CONFIG = DatasetConfig(
     languages=[FI],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/french.py
CHANGED

@@ -1,7 +1,6 @@
 """All French dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import FR
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -123,7 +122,6 @@ WINOGRANDE_FR_CONFIG = DatasetConfig(
     languages=[FR],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/german.py
CHANGED

@@ -1,7 +1,6 @@
 """All German dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import DE
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -140,7 +139,6 @@ WINOGRANDE_DE_CONFIG = DatasetConfig(
     languages=[DE],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/italian.py
CHANGED

@@ -1,7 +1,6 @@
 """All Italian dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import IT
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -131,7 +130,6 @@ WINOGRANDE_IT_CONFIG = DatasetConfig(
     languages=[IT],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/latvian.py
CHANGED

@@ -1,7 +1,6 @@
 """All Latvian dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import LV
 from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
 
@@ -25,7 +24,8 @@ SCALA_LV_CONFIG = DatasetConfig(
 
 FULLSTACK_NER_LV_CONFIG = DatasetConfig(
     name="fullstack-ner-lv",
-    pretty_name="the truncated version of the
+    pretty_name="the truncated version of the Latvian named entity recognition "
+    "dataset FullStack-NER-lv",
     huggingface_id="EuroEval/fullstack-ner-lv-mini",
     task=NER,
     languages=[LV],
@@ -90,6 +90,5 @@ WINOGRANDE_LV_CONFIG = DatasetConfig(
     languages=[LV],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )

euroeval/dataset_configs/lithuanian.py
ADDED

@@ -0,0 +1,62 @@
+"""All Lithuanian dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import LT
+from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
+
+### Official datasets ###
+
+LITHUANIAN_EMOTIONS_CONFIG = DatasetConfig(
+    name="lithuanian-emotions",
+    pretty_name="the truncated version of the Lithuanian sentiment "
+    "classification dataset Lithuanian Emotions",
+    huggingface_id="EuroEval/lithuanian-emotions-mini",
+    task=SENT,
+    languages=[LT],
+)
+
+SCALA_LT_CONFIG = DatasetConfig(
+    name="scala-lt",
+    pretty_name="the Lithuanian part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-lt",
+    task=LA,
+    languages=[LT],
+)
+
+WIKIANN_LT_CONFIG = DatasetConfig(
+    name="wikiann-lt",
+    pretty_name="the truncated version of the Lithuanian part of the named entity "
+    "recognition dataset WikiANN",
+    huggingface_id="EuroEval/wikiann-lt-mini",
+    task=NER,
+    languages=[LT],
+)
+
+MULTI_WIKI_QA_LT_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-lt",
+    pretty_name="the truncated version of the Lithuanian part of the reading "
+    "comprehension dataset MultiWikiQA",
+    huggingface_id="EuroEval/multi-wiki-qa-lt-mini",
+    task=RC,
+    languages=[LT],
+)
+
+LT_HISTORY_CONFIG = DatasetConfig(
+    name="lt-history",
+    pretty_name="the Lithuanian knowledge dataset LT-History",
+    huggingface_id="EuroEval/lt-history",
+    task=KNOW,
+    languages=[LT],
+    splits=["train", "test"],
+)
+
+WINOGRANDE_LT_CONFIG = DatasetConfig(
+    name="winogrande-lt",
+    pretty_name="the Lithuanian common-sense reasoning dataset Winogrande-lt, "
+    "translated from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-lt",
+    task=COMMON_SENSE,
+    languages=[LT],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+)

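With the new module re-exported from dataset_configs/__init__.py above, the Lithuanian datasets become selectable by their name fields. A hypothetical invocation (the --model and --dataset flags are assumed from EuroEval's CLI and are not part of this diff):

    euroeval --model <model-id> --dataset scala-lt
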
euroeval/dataset_configs/norwegian.py
CHANGED

@@ -1,7 +1,6 @@
 """All Norwegian dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import NB, NN, NO
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -226,7 +225,6 @@ WINOGRANDE_NO_CONFIG = DatasetConfig(
     languages=[NB, NN, NO],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/polish.py
CHANGED

@@ -1,7 +1,6 @@
 """All Polish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import PL
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, NER, RC, SENT, SUMM
 
@@ -64,7 +63,6 @@ WINOGRANDE_PL_CONFIG = DatasetConfig(
     languages=[PL],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
 )
 
 EUROPEAN_VALUES_PL_CONFIG = DatasetConfig(

euroeval/dataset_configs/portuguese.py
CHANGED

@@ -1,7 +1,6 @@
 """All Portuguese dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import PT
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -101,7 +100,6 @@ WINOGRANDE_PT_CONFIG = DatasetConfig(
     languages=[PT],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/spanish.py
CHANGED

@@ -1,7 +1,6 @@
 """All Spanish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import ES
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -129,7 +128,6 @@ WINOGRANDE_ES_CONFIG = DatasetConfig(
     languages=[ES],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/swedish.py
CHANGED

@@ -1,7 +1,6 @@
 """All Swedish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import SV
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -140,7 +139,6 @@ WINOGRANDE_SV_CONFIG = DatasetConfig(
     languages=[SV],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 
@@ -177,6 +175,5 @@ SKOLPROV_CONFIG = DatasetConfig(
     task=KNOW,
     languages=[SV],
     splits=["train", "test"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )

euroeval/metrics/huggingface.py
CHANGED

@@ -197,7 +197,7 @@ bert_score_metric = HuggingFaceMetric(
     huggingface_id="bertscore",
     results_key="f1",
     compute_kwargs=dict(
-        model_type="microsoft/mdeberta-v3-base", device="
+        model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
     ),
 )
 

euroeval/metrics/pipeline.py
CHANGED

@@ -191,6 +191,11 @@ def european_values_preprocessing_fn(
            for idx, choice in idx_to_choice.items()
            if choice is not None
        }
+        if prediction not in idx_to_choice:
+            raise InvalidBenchmark(
+                f"The prediction {prediction} is not a valid index for the "
+                f"question with choices {idx_to_choice}."
+            )
        integer_prediction = idx_to_choice[prediction]
        integer_predictions.append(integer_prediction)
 

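To make the new guard concrete: an out-of-range prediction previously surfaced as a bare KeyError from the idx_to_choice lookup, whereas it now raises InvalidBenchmark with the offending value. A toy illustration in plain Python (the mapping and the stand-in exception class are invented, not EuroEval code):

    class InvalidBenchmark(Exception):
        """Stand-in for EuroEval's InvalidBenchmark exception."""

    idx_to_choice = {1: 10, 2: 20, 3: 30}  # hypothetical index -> choice mapping
    prediction = 5  # an index with no corresponding choice

    if prediction not in idx_to_choice:
        raise InvalidBenchmark(
            f"The prediction {prediction} is not a valid index for the "
            f"question with choices {idx_to_choice}."
        )
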
euroeval/prompt_templates/linguistic_acceptability.py
CHANGED

@@ -14,6 +14,7 @@ from ..languages import (
     FR,
     IS,
     IT,
+    LT,
     LV,
     NB,
     NL,
@@ -126,6 +127,14 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
         default_instruction_prompt="Frase: {text}\n\nStabilite se la frase è "
         "grammaticalmente corretta o meno. Rispondere con {labels_str}, e nient'altro.",
     ),
+    LT: PromptConfig(
+        default_prompt_label_mapping=dict(correct="taip", incorrect="ne"),
+        default_prompt_prefix="Toliau pateikti sakiniai ir ar jie yra gramatiškai "
+        "teisingi.",
+        default_prompt_template="Sakinys: {text}\nGramatiškai teisingas: {label}",
+        default_instruction_prompt="Sakinys: {text}\n\nNustatykite, ar sakinys yra "
+        "gramatiškai teisingas, ar ne. Atsakykite su {labels_str}, ir nieko kito.",
+    ),
     LV: PromptConfig(
         default_prompt_label_mapping=dict(correct="jā", incorrect="nē"),
         default_prompt_prefix="Šie ir teikumi un to gramatiskie pareizumi.",

euroeval/prompt_templates/multiple_choice.py
CHANGED

@@ -13,6 +13,7 @@ from ..languages import (
     FR,
     IS,
     IT,
+    LT,
     LV,
     NB,
     NL,
@@ -105,6 +106,14 @@ MULTIPLE_CHOICE_TEMPLATES: dict["Language", PromptConfig] = {
         "precedente con {labels_str}, e nient'altro.",
         default_prompt_label_mapping="auto",
     ),
+    LT: PromptConfig(
+        default_prompt_prefix="Toliau pateikti daugiavariančiai klausimai "
+        "(su atsakymais).",
+        default_prompt_template="Klausimas: {text}\nAtsakymas: {label}",
+        default_instruction_prompt="Klausimas: {text}\n\nAtsakykite į aukščiau "
+        "pateiktą klausimą atsakydami {labels_str}, ir nieko daugiau.",
+        default_prompt_label_mapping="auto",
+    ),
     LV: PromptConfig(
         default_prompt_prefix="Tālāk seko jautājumi ar vairākām atbilžu izvēlēm "
         "(ar atbildēm).",

euroeval/prompt_templates/named_entity_recognition.py
CHANGED

@@ -14,6 +14,7 @@ from ..languages import (
     FR,
     IS,
     IT,
+    LT,
     LV,
     NB,
     NL,
@@ -241,6 +242,25 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
         "{labels_str}. I valori devono essere elenchi di entità "
         "nominate di quel tipo, esattamente come appaiono nella frase.",
     ),
+    LT: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "asmuo",
+            "i-per": "asmuo",
+            "b-loc": "vieta",
+            "i-loc": "vieta",
+            "b-org": "organizacija",
+            "i-org": "organizacija",
+            "b-misc": "kita",
+            "i-misc": "kita",
+        },
+        default_prompt_prefix="Toliau pateikti sakiniai ir JSON žodynai su vardiniais "
+        "vienetais, kurie pateikiame sakinyje.",
+        default_prompt_template="Sakinys: {text}\nVardiniai vienetai: {label}",
+        default_instruction_prompt="Sakinys: {text}\n\nIdentifikuokite vardinius "
+        "vienetus sakinyje. Turėtumėte pateikti tai kaip JSON žodyną su raktais "
+        "{labels_str}. Reikšmės turi būti to tipo vardinių vienetų sąrašai, "
+        "tiksliai taip, kaip jie rodomi sakinyje.",
+    ),
     LV: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "persona",

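For illustration, the few-shot default_prompt_template above pairs a sentence with a JSON dictionary keyed by the four Lithuanian label names from the mapping (asmuo, vieta, organizacija, kita). A hypothetical rendered example; the sentence and its entities are invented, not taken from the dataset:

    Sakinys: Jonas gyvena Vilniuje.
    Vardiniai vienetai: {"asmuo": ["Jonas"], "vieta": ["Vilniuje"], "organizacija": [], "kita": []}
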
euroeval/prompt_templates/reading_comprehension.py
CHANGED

@@ -14,6 +14,7 @@ from ..languages import (
     FR,
     IS,
     IT,
+    LT,
     LV,
     NB,
     NL,
@@ -116,6 +117,15 @@ RC_TEMPLATES: dict["Language", PromptConfig] = {
         "sul in un massimo di 3 parole.\n\nDomanda: {question}",
         default_prompt_label_mapping=dict(),
     ),
+    LT: PromptConfig(
+        default_prompt_prefix="Toliau pateikti tekstai su atitinkamais klausimais ir "
+        "atsakymais.",
+        default_prompt_template="Tekstas: {text}\nKlausimas: {question}\nAtsakykite ne "
+        "daugiau kaip 3 žodžiais: {label}",
+        default_instruction_prompt="Tekstas: {text}\n\nAtsakykite į šį klausimą apie "
+        "aukščiau pateiktą tekstą ne daugiau kaip 3 žodžiais.\n\nKlausimas: {question}",
+        default_prompt_label_mapping=dict(),
+    ),
     LV: PromptConfig(
         default_prompt_prefix="Turpmāk seko teksti ar atbilstošiem jautājumiem un "
         "atbildēm.",

euroeval/prompt_templates/sentiment_classification.py
CHANGED

@@ -14,6 +14,7 @@ from ..languages import (
     FR,
     IS,
     IT,
+    LT,
     LV,
     NB,
     NL,
@@ -153,6 +154,16 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
         default_instruction_prompt="Documento: {text}\n\nClassificare il sentiment del "
         "documento. Rispondere con {labels_str}, e nient'altro.",
     ),
+    LT: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="teigiamas", neutral="neutralus", negative="neigiamas"
+        ),
+        default_prompt_prefix="Toliau pateikti dokumentai ir jų nuotaika, kuri "
+        "gali būti {labels_str}.",
+        default_prompt_template="Dokumentas: {text}\nNuotaika: {label}",
+        default_instruction_prompt="Dokumentas: {text}\n\nKlasifikuokite nuotaiką "
+        "dokumente. Atsakykite su {labels_str}, ir nieko kito.",
+    ),
     LV: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="pozitīvs", neutral="neitrāls", negative="negatīvs"

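For illustration, the few-shot default_prompt_template above fills in as follows; the example document is invented, and "teigiamas" comes from the label mapping shown:

    # A minimal sketch of how the Lithuanian sentiment few-shot template renders.
    template = "Dokumentas: {text}\nNuotaika: {label}"
    print(template.format(text="Puikus filmas, labai rekomenduoju!", label="teigiamas"))
    # Dokumentas: Puikus filmas, labai rekomenduoju!
    # Nuotaika: teigiamas
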
euroeval/tokenisation_utils.py
CHANGED

@@ -521,7 +521,14 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
     Returns:
         Whether the tokeniser has a chat template.
     """
-    if
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        log_once(
+            "The tokeniser is a Mistral tokeniser, so assuming that the model is "
+            "instruction tuned.",
+            level=logging.DEBUG,
+        )
+        return True
+    elif hasattr(tokeniser, "chat_template"):
         has_template = tokeniser.chat_template is not None
         if has_template:
             log_once(
@@ -530,13 +537,6 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
                 level=logging.DEBUG,
             )
         return has_template
-    elif isinstance(tokeniser, MistralCommonTokenizer):
-        log_once(
-            "The tokeniser is a Mistral tokeniser, so assuming that the model is "
-            "instruction tuned.",
-            level=logging.DEBUG,
-        )
-        return True
     else:
         log_once(
             "We cannot find a chat template for the tokeniser, so assuming that the "