EuroEval 15.5.0-py3-none-any.whl → 15.6.1-py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- euroeval/benchmark_modules/base.py +3 -2
- euroeval/benchmark_modules/fresh.py +8 -6
- euroeval/benchmark_modules/hf.py +33 -31
- euroeval/benchmark_modules/litellm.py +120 -56
- euroeval/benchmark_modules/vllm.py +41 -26
- euroeval/benchmarker.py +23 -21
- euroeval/callbacks.py +2 -2
- euroeval/constants.py +1 -1
- euroeval/data_models.py +261 -42
- euroeval/dataset_configs/__init__.py +61 -0
- euroeval/dataset_configs/danish.py +120 -0
- euroeval/dataset_configs/dutch.py +123 -0
- euroeval/dataset_configs/english.py +88 -0
- euroeval/dataset_configs/faroese.py +54 -0
- euroeval/dataset_configs/french.py +83 -0
- euroeval/dataset_configs/german.py +91 -0
- euroeval/dataset_configs/icelandic.py +148 -0
- euroeval/dataset_configs/italian.py +81 -0
- euroeval/dataset_configs/norwegian.py +178 -0
- euroeval/dataset_configs/spanish.py +78 -0
- euroeval/dataset_configs/swedish.py +100 -0
- euroeval/exceptions.py +10 -10
- euroeval/finetuning.py +6 -10
- euroeval/generation.py +1 -0
- euroeval/human_evaluation.py +2 -2
- euroeval/languages.py +20 -13
- euroeval/model_cache.py +1 -1
- euroeval/model_loading.py +1 -12
- euroeval/prompt_templates/__init__.py +8 -0
- euroeval/prompt_templates/linguistic_acceptability.py +112 -0
- euroeval/prompt_templates/multiple_choice.py +97 -0
- euroeval/prompt_templates/named_entity_recognition.py +257 -0
- euroeval/prompt_templates/reading_comprehension.py +118 -0
- euroeval/prompt_templates/sentiment_classification.py +137 -0
- euroeval/prompt_templates/summarization.py +97 -0
- euroeval/speed_benchmark.py +1 -1
- euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
- euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
- euroeval/{task_utils → task_group_utils}/sequence_classification.py +1 -1
- euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
- euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
- euroeval/tasks.py +54 -0
- euroeval/tokenization_utils.py +343 -0
- euroeval/types.py +3 -1
- euroeval/utils.py +2 -347
- {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/METADATA +31 -9
- euroeval-15.6.1.dist-info/RECORD +59 -0
- euroeval/dataset_configs.py +0 -2408
- euroeval-15.5.0.dist-info/RECORD +0 -40
- /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
- {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/WHEEL +0 -0
- {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/entry_points.txt +0 -0
- {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/licenses/LICENSE +0 -0
euroeval/dataset_configs/norwegian.py
ADDED
@@ -0,0 +1,178 @@
+"""All Norwegian dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import NB, NN, NO
+from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+NOREC_CONFIG = DatasetConfig(
+    name="norec",
+    pretty_name="the truncated version of the Norwegian sentiment classification "
+    "dataset NoReC",
+    huggingface_id="EuroEval/norec-mini",
+    task=SENT,
+    languages=[NB, NN, NO],
+)
+
+SCALA_NB_CONFIG = DatasetConfig(
+    name="scala-nb",
+    pretty_name="the Bokmål part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-nb",
+    task=LA,
+    languages=[NB, NO],
+)
+
+SCALA_NN_CONFIG = DatasetConfig(
+    name="scala-nn",
+    pretty_name="the Nynorsk part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-nn",
+    task=LA,
+    languages=[NN],
+)
+
+NORNE_NB_CONFIG = DatasetConfig(
+    name="norne-nb",
+    pretty_name="the truncated version of the Bokmål part of the Norwegian named "
+    "entity recognition dataset NorNE",
+    huggingface_id="EuroEval/norne-nb-mini",
+    task=NER,
+    languages=[NB, NO],
+)
+
+NORNE_NN_CONFIG = DatasetConfig(
+    name="norne-nn",
+    pretty_name="the truncated version of the Nynorsk part of the Norwegian named "
+    "entity recognition dataset NorNE",
+    huggingface_id="EuroEval/norne-nn-mini",
+    task=NER,
+    languages=[NN],
+)
+
+NORQUAD_CONFIG = DatasetConfig(
+    name="norquad",
+    pretty_name="the truncated version of the Norwegian question answering "
+    "dataset NorQuAD",
+    huggingface_id="EuroEval/norquad-mini",
+    task=RC,
+    languages=[NB, NN, NO],
+    _num_few_shot_examples=2,
+)
+
+NO_SAMMENDRAG_CONFIG = DatasetConfig(
+    name="no-sammendrag",
+    pretty_name="the truncated version of the Norwegian summarisation dataset "
+    "Norske Sammendrag",
+    huggingface_id="EuroEval/no-sammendrag-mini",
+    task=SUMM,
+    languages=[NB, NN, NO],
+)
+
+NRK_QUIZ_QA_CONFIG = DatasetConfig(
+    name="nrk-quiz-qa",
+    pretty_name="the truncated version of the Norwegian knowledge dataset NRK Quiz QA",
+    huggingface_id="EuroEval/nrk-quiz-qa-mini",
+    task=KNOW,
+    languages=[NB, NN, NO],
+)
+
+NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
+    name="nor-common-sense-qa",
+    pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
+    "NorCommonSenseQA",
+    huggingface_id="EuroEval/nor-common-sense-qa",
+    task=COMMON_SENSE,
+    languages=[NB, NN, NO],
+    _labels=["a", "b", "c", "d", "e"],
+)
+
+
+### Unofficial datasets ###
+
+NO_COLA_CONFIG = DatasetConfig(
+    name="no-cola",
+    pretty_name="the truncated version of the Norwegian linguistic acceptability "
+    "dataset NoCoLA",
+    huggingface_id="EuroEval/no-cola-mini",
+    task=LA,
+    languages=[NB, NO],
+    unofficial=True,
+)
+
+NORGLM_MULTI_QA = DatasetConfig(
+    name="norglm-multi-qa",
+    pretty_name="the question answering part of the Norwegian NorGLM multi-task human "
+    "annotated dataset NO-Multi-QA-Sum",
+    huggingface_id="EuroEval/norglm-multi-qa",
+    task=RC,
+    languages=[NB, NN, NO],
+    unofficial=True,
+)
+
+NORGLM_MULTI_SUM = DatasetConfig(
+    name="norglm-multi-sum",
+    pretty_name="the summarisation part of the Norwegian NorGLM multi-task human "
+    "annotated dataset NO-Multi-QA-Sum",
+    huggingface_id="EuroEval/norglm-multi-sum",
+    task=SUMM,
+    languages=[NB, NN, NO],
+    unofficial=True,
+)
+
+SCHIBSTED_NO_CONFIG = DatasetConfig(
+    name="schibsted-no",
+    pretty_name="the Norwegian summarisation dataset Schibsted-no",
+    huggingface_id="EuroEval/schibsted-article-summaries-no",
+    task=SUMM,
+    languages=[NB, NN, NO],
+    unofficial=True,
+)
+
+PERSONAL_SUM_CONFIG = DatasetConfig(
+    name="personal-sum",
+    pretty_name="the Norwegian summarisation dataset personal-sum",
+    huggingface_id="EuroEval/personal-sum",
+    task=SUMM,
+    languages=[NB, NN, NO],
+    unofficial=True,
+)
+
+MMLU_NO_CONFIG = DatasetConfig(
+    name="mmlu-no",
+    pretty_name="the truncated version of the Norwegian knowledge dataset MMLU-no, "
+    "translated from the English MMLU dataset",
+    huggingface_id="EuroEval/mmlu-no-mini",
+    task=KNOW,
+    languages=[NB, NN, NO],
+    unofficial=True,
+)
+
+ARC_NO_CONFIG = DatasetConfig(
+    name="arc-no",
+    pretty_name="the truncated version of the Norwegian knowledge dataset ARC-no, "
+    "translated from the English ARC dataset",
+    huggingface_id="EuroEval/arc-no-mini",
+    task=KNOW,
+    languages=[NB, NN, NO],
+    unofficial=True,
+)
+
+HELLASWAG_NO_CONFIG = DatasetConfig(
+    name="hellaswag-no",
+    pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
+    "HellaSwag-no, translated from the English HellaSwag dataset",
+    huggingface_id="EuroEval/hellaswag-no-mini",
+    task=COMMON_SENSE,
+    languages=[NB, NN, NO],
+    unofficial=True,
+)
+
+BELEBELE_NO_CONFIG = DatasetConfig(
+    name="belebele-no",
+    pretty_name="the Norwegian multiple choice reading comprehension dataset "
+    "BeleBele-no, translated from the English BeleBele dataset",
+    huggingface_id="EuroEval/belebele-no-mini",
+    task=MCRC,
+    languages=[NB, NN, NO],
+    unofficial=True,
+)
euroeval/dataset_configs/spanish.py
ADDED
@@ -0,0 +1,78 @@
+"""All Spanish dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import ES
+from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+SENTIMENT_HEADLINES_CONFIG = DatasetConfig(
+    name="sentiment-headlines-es",
+    pretty_name="the truncated version of the Spanish sentiment headlines dataset",
+    huggingface_id="EuroEval/sentiment-headlines-es",
+    task=SENT,
+    languages=[ES],
+)
+
+SCALA_ES_CONFIG = DatasetConfig(
+    name="scala-es",
+    pretty_name="the Spanish part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-es",
+    task=LA,
+    languages=[ES],
+)
+
+CONLL_ES_CONFIG = DatasetConfig(
+    name="conll-es",
+    pretty_name="the Spanish part of the truncated version of the named entity "
+    "recognition dataset CoNLL 2002",
+    huggingface_id="EuroEval/conll-es-mini",
+    task=NER,
+    languages=[ES],
+)
+
+MLQA_ES_CONFIG = DatasetConfig(
+    name="mlqa-es",
+    pretty_name="the Spanish version of the MLQA reading comprehension dataset",
+    huggingface_id="EuroEval/mlqa-es",
+    task=RC,
+    languages=[ES],
+)
+
+MLSUM_ES_CONFIG = DatasetConfig(
+    name="mlsum-es",
+    pretty_name="the truncated version of the Spanish summarisation dataset MLSum-es",
+    huggingface_id="EuroEval/mlsum-es-mini",
+    task=SUMM,
+    languages=[ES],
+)
+
+MMLU_ES_CONFIG = DatasetConfig(
+    name="mmlu-es",
+    pretty_name="the truncated version of the Spanish knowledge dataset MMLU-es, "
+    "translated from the English MMLU dataset",
+    huggingface_id="EuroEval/mmlu-es-mini",
+    task=KNOW,
+    languages=[ES],
+)
+
+HELLASWAG_ES_CONFIG = DatasetConfig(
+    name="hellaswag-es",
+    pretty_name="the truncated version of the Spanish common-sense reasoning dataset "
+    "HellaSwag-es, translated from the English HellaSwag dataset",
+    huggingface_id="EuroEval/hellaswag-es-mini",
+    task=COMMON_SENSE,
+    languages=[ES],
+)
+
+
+### Unofficial datasets ###
+
+XQUAD_ES_CONFIG = DatasetConfig(
+    name="xquad-es",
+    pretty_name="the Spanish version of the XQuAD reading comprehension dataset",
+    huggingface_id="EuroEval/xquad-es",
+    task=RC,
+    languages=[ES],
+    unofficial=True,
+)
euroeval/dataset_configs/swedish.py
ADDED
@@ -0,0 +1,100 @@
+"""All Swedish dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import SV
+from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+SWEREC_CONFIG = DatasetConfig(
+    name="swerec",
+    pretty_name="the truncated version of the Swedish sentiment classification "
+    "dataset SweReC",
+    huggingface_id="EuroEval/swerec-mini",
+    task=SENT,
+    languages=[SV],
+)
+
+SCALA_SV_CONFIG = DatasetConfig(
+    name="scala-sv",
+    pretty_name="The Swedish part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-sv",
+    task=LA,
+    languages=[SV],
+)
+
+SUC3_CONFIG = DatasetConfig(
+    name="suc3",
+    pretty_name="the truncated version of the Swedish named entity recognition "
+    "dataset SUC 3.0",
+    huggingface_id="EuroEval/suc3-mini",
+    task=NER,
+    languages=[SV],
+)
+
+SCANDIQA_SV_CONFIG = DatasetConfig(
+    name="scandiqa-sv",
+    pretty_name="the Swedish part of the truncated version of the question answering "
+    "dataset ScandiQA",
+    huggingface_id="EuroEval/scandiqa-sv-mini",
+    task=RC,
+    languages=[SV],
+)
+
+SWEDN_CONFIG = DatasetConfig(
+    name="swedn",
+    pretty_name="the truncated version of the Swedish summarisation dataset SweDN",
+    huggingface_id="EuroEval/swedn-mini",
+    task=SUMM,
+    languages=[SV],
+)
+
+MMLU_SV_CONFIG = DatasetConfig(
+    name="mmlu-sv",
+    pretty_name="the truncated version of the Swedish knowledge dataset MMLU-sv, "
+    "translated from the English MMLU dataset",
+    huggingface_id="EuroEval/mmlu-sv-mini",
+    task=KNOW,
+    languages=[SV],
+)
+
+HELLASWAG_SV_CONFIG = DatasetConfig(
+    name="hellaswag-sv",
+    pretty_name="the truncated version of the Swedish common-sense reasoning dataset "
+    "HellaSwag-sv, translated from the English HellaSwag dataset",
+    huggingface_id="EuroEval/hellaswag-sv-mini",
+    task=COMMON_SENSE,
+    languages=[SV],
+)
+
+
+### Unofficial datasets ###
+
+SCHIBSTED_SV_CONFIG = DatasetConfig(
+    name="schibsted-sv",
+    pretty_name="the Swedish summarisation dataset Schibsted-sv",
+    huggingface_id="EuroEval/schibsted-article-summaries-sv",
+    task=SUMM,
+    languages=[SV],
+    unofficial=True,
+)
+
+ARC_SV_CONFIG = DatasetConfig(
+    name="arc-sv",
+    pretty_name="the truncated version of the Swedish knowledge dataset ARC-sv, "
+    "translated from the English ARC dataset",
+    huggingface_id="EuroEval/arc-sv-mini",
+    task=KNOW,
+    languages=[SV],
+    unofficial=True,
+)
+
+BELEBELE_SV_CONFIG = DatasetConfig(
+    name="belebele-sv",
+    pretty_name="the Swedish multiple choice reading comprehension dataset "
+    "BeleBele-sv, translated from the English BeleBele dataset",
+    huggingface_id="EuroEval/belebele-sv-mini",
+    task=MCRC,
+    languages=[SV],
+    unofficial=True,
+)
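The three files above replace part of the monolithic euroeval/dataset_configs.py (deleted in this release, per the file list above) with per-language modules. As a rough illustration of how a registered config is consumed — a minimal sketch assuming the public Benchmarker API and that get_all_dataset_configs (imported in human_evaluation.py below) returns a name-to-config mapping:

    # Minimal sketch, not part of the diff. Assumes `Benchmarker.benchmark`
    # accepts a `dataset` argument and that `get_all_dataset_configs` maps
    # dataset names to DatasetConfig objects.
    from euroeval import Benchmarker
    from euroeval.dataset_configs import get_all_dataset_configs

    configs = get_all_dataset_configs()
    print(configs["norec"].huggingface_id)  # expected: "EuroEval/norec-mini"

    benchmarker = Benchmarker(progress_bar=False)
    benchmarker.benchmark(model="<model-id>", dataset="norec")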
euroeval/exceptions.py
CHANGED
@@ -7,7 +7,7 @@ class InvalidBenchmark(Exception):
     def __init__(
         self, message: str = "This model cannot be benchmarked on the given dataset."
     ) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             message:
@@ -23,7 +23,7 @@ class InvalidModel(Exception):
     def __init__(
         self, message: str = "The model cannot be benchmarked on any datasets."
     ) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             message:
@@ -39,7 +39,7 @@ class HuggingFaceHubDown(Exception):
     def __init__(
         self, message: str = "The Hugging Face Hub is currently down."
     ) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             message:
@@ -55,7 +55,7 @@ class NoInternetConnection(Exception):
     def __init__(
         self, message: str = "There is currently no internet connection."
     ) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             message:
@@ -71,7 +71,7 @@ class NaNValueInModelOutput(Exception):
     def __init__(
         self, message: str = "There is a NaN value in the model output."
     ) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             message:
@@ -93,7 +93,7 @@ class FlashAttentionNotInstalled(Exception):
             "pip install flash-attn --no-build-isolation`."
         ),
     ) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             message:
@@ -107,7 +107,7 @@ class NeedsExtraInstalled(InvalidModel):
     """The evaluation requires extra to be installed."""
 
     def __init__(self, extra: str) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             extra:
@@ -126,7 +126,7 @@ class NeedsManualDependency(InvalidModel):
     """The evaluation requires a dependency to be manually installed."""
 
     def __init__(self, package: str) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             package:
@@ -146,7 +146,7 @@ class NeedsAdditionalArgument(InvalidModel):
     def __init__(
         self, cli_argument: str, script_argument: str, run_with_cli: bool
     ) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             cli_argument:
@@ -177,7 +177,7 @@ class NeedsEnvironmentVariable(InvalidModel):
     """The evaluation requires an environment variable to be set."""
 
    def __init__(self, env_var: str) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             env_var:
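Every hunk in this file makes the same mechanical change: the first line of each __init__ docstring gains a summary sentence, which pydocstyle-style linters (e.g. Ruff's D rules) require. A sketch of the resulting pattern; the class docstring and the body of __init__ are assumptions for illustration, not verbatim package code:

    class InvalidBenchmark(Exception):
        """The model cannot be benchmarked on the given dataset."""

        def __init__(
            self,
            message: str = "This model cannot be benchmarked on the given dataset.",
        ) -> None:
            """Initialise the exception.

            Args:
                message:
                    The exception message.
            """
            # Assumed body: store the message and delegate to Exception.
            self.message = message
            super().__init__(self.message)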
euroeval/finetuning.py
CHANGED
@@ -7,14 +7,13 @@ import typing as t
 import torch
 from datasets import DatasetDict
 from tqdm.auto import tqdm
-from transformers import (
+from transformers.trainer_callback import (
     EarlyStoppingCallback,
-    IntervalStrategy,
     PrinterCallback,
     ProgressCallback,
-    TrainingArguments,
 )
-from transformers.training_args import OptimizerNames
+from transformers.trainer_utils import IntervalStrategy
+from transformers.training_args import OptimizerNames, TrainingArguments
 
 from .benchmark_modules import BenchmarkModule
 from .callbacks import NeverLeaveProgressCallback
@@ -67,9 +66,6 @@ def finetune(
     else:
         dtype = DataType.FP32
 
-    # TEMP
-    dtype = DataType.FP32
-
     bs: int = benchmark_config.batch_size
     scores: list[dict[str, float]] = list()
     for idx in tqdm(
@@ -212,7 +208,7 @@ def finetune_single_iteration(
 
     if not benchmark_config.verbose:
 
-        def no_logging(logs: dict[str, float]) -> None:
+        def no_logging(logs: dict[str, float], start_time: float | None = None) -> None:
             return
 
         trainer.log = no_logging
@@ -292,7 +288,7 @@ def get_training_args(
 
     training_args = TrainingArguments(
         output_dir=model_config.model_cache_dir,
-        evaluation_strategy=IntervalStrategy.STEPS,
+        eval_strategy=IntervalStrategy.STEPS,
         logging_strategy=logging_strategy,
         save_strategy=IntervalStrategy.STEPS,
         eval_steps=30,
@@ -304,11 +300,11 @@ def get_training_args(
         save_total_limit=1,
         per_device_train_batch_size=batch_size,
         per_device_eval_batch_size=batch_size,
+        optim=OptimizerNames.ADAMW_TORCH,
         learning_rate=2e-5,
         warmup_ratio=0.01,
         gradient_accumulation_steps=32 // batch_size,
         load_best_model_at_end=True,
-        optim=OptimizerNames.ADAMW_TORCH,
         seed=4242 + iteration_idx,
         fp16=dtype == DataType.FP16,
         bf16=dtype == DataType.BF16,
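Several independent fixes here: the transformers imports move to concrete submodules, a leftover "# TEMP" override that forced FP32 training is removed, no_logging gains the start_time parameter that newer transformers versions pass to Trainer.log, and the deprecated evaluation_strategy argument becomes eval_strategy. A minimal sketch of the updated TrainingArguments call under the new imports; values not visible in the hunks are placeholders:

    from transformers.trainer_utils import IntervalStrategy
    from transformers.training_args import OptimizerNames, TrainingArguments

    args = TrainingArguments(
        output_dir="./model-cache",            # placeholder path
        eval_strategy=IntervalStrategy.STEPS,  # renamed from `evaluation_strategy`
        save_strategy=IntervalStrategy.STEPS,
        eval_steps=30,
        save_steps=30,                         # assumed; must align with eval_steps
        load_best_model_at_end=True,
        optim=OptimizerNames.ADAMW_TORCH,
        learning_rate=2e-5,
        warmup_ratio=0.01,
        seed=4242,
    )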
euroeval/generation.py
CHANGED
@@ -133,6 +133,7 @@ def generate_single_iteration(
     all_preds: list[str] = list()
 
     if len(non_cached_dataset) > 0:
+        itr: t.Iterable
         match model.batching_preference:
             case BatchingPreference.SINGLE_SAMPLE:
                 itr = tqdm(iterable=non_cached_dataset, leave=False)
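The added "itr: t.Iterable" line pre-declares the variable so that a static type checker accepts a different iterable type being assigned in each match arm. A self-contained illustration of the same pattern (not EuroEval code):

    import typing as t

    def choose_iterable(batched: bool, data: list[str]) -> t.Iterable:
        itr: t.Iterable  # declared once, so both assignments below type-check
        if batched:
            itr = [data]      # a list of batches
        else:
            itr = iter(data)  # an iterator over single samples
        return itr

    print(list(choose_iterable(False, ["a", "b"])))  # ['a', 'b']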
euroeval/human_evaluation.py
CHANGED
@@ -17,7 +17,7 @@ from .dataset_configs import SPEED_CONFIG, get_all_dataset_configs
 from .enums import GenerativeType, TaskGroup
 from .exceptions import NeedsExtraInstalled
 from .scores import aggregate_scores
-from .task_utils import (
+from .task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,
@@ -44,7 +44,7 @@ class HumanEvaluator:
         description: str,
         dummy_model_id: str = "mistralai/Mistral-7B-v0.1",
     ) -> None:
-        """
+        """Initialise the HumanEvaluator.
 
         Args:
             annotator_id:
euroeval/languages.py
CHANGED
@@ -17,6 +17,26 @@ def get_all_languages() -> dict[str, Language]:
     return {cfg.code: cfg for cfg in globals().values() if isinstance(cfg, Language)}
 
 
+### Currently Supported Languages ###
+DA = Language(code="da", name="Danish", _and_separator="og", _or_separator="eller")
+NL = Language(code="nl", name="Dutch", _and_separator="en", _or_separator="of")
+EN = Language(code="en", name="English", _and_separator="and", _or_separator="or")
+FO = Language(code="fo", name="Faroese", _and_separator="og", _or_separator="ella")
+FR = Language(code="fr", name="French", _and_separator="et", _or_separator="ou")
+DE = Language(code="de", name="German", _and_separator="und", _or_separator="oder")
+IS = Language(code="is", name="Icelandic", _and_separator="og", _or_separator="eða")
+IT = Language(code="it", name="Italian", _and_separator="e", _or_separator="o")
+NO = Language(code="no", name="Norwegian", _and_separator="og", _or_separator="eller")
+NB = Language(
+    code="nb", name="Norwegian Bokmål", _and_separator="og", _or_separator="eller"
+)
+NN = Language(
+    code="nn", name="Norwegian Nynorsk", _and_separator="og", _or_separator="eller"
+)
+ES = Language(code="es", name="Spanish", _and_separator="y", _or_separator="o")
+SV = Language(code="sv", name="Swedish", _and_separator="och", _or_separator="eller")
+
+
 AB = Language(code="ab", name="Abkhazian")
 AA = Language(code="aa", name="Afar")
 AF = Language(code="af", name="Afrikaans")
@@ -52,25 +72,19 @@ CO = Language(code="co", name="Corsican")
 CR = Language(code="cr", name="Cree")
 HR = Language(code="hr", name="Croatian")
 CS = Language(code="cs", name="Czech")
-DA = Language(code="da", name="Danish")
 DV = Language(code="dv", name="Divehi")
-NL = Language(code="nl", name="Dutch")
 DZ = Language(code="dz", name="Dzongkha")
-EN = Language(code="en", name="English")
 EO = Language(code="eo", name="Esperanto")
 ET = Language(code="et", name="Estonian")
 EE = Language(code="ee", name="Ewe")
-FO = Language(code="fo", name="Faroese")
 FJ = Language(code="fj", name="Fijian")
 FI = Language(code="fi", name="Finnish")
-FR = Language(code="fr", name="French")
 FY = Language(code="fy", name="Western Frisian")
 FF = Language(code="ff", name="Fulah")
 GD = Language(code="gd", name="Gaelic")
 GL = Language(code="gl", name="Galician")
 LG = Language(code="lg", name="Ganda")
 KA = Language(code="ka", name="Georgian")
-DE = Language(code="de", name="German")
 EL = Language(code="el", name="Greek")
 KL = Language(code="kl", name="Greenlandic")
 GN = Language(code="gn", name="Guarani")
@@ -82,7 +96,6 @@ HZ = Language(code="hz", name="Herero")
 HI = Language(code="hi", name="Hindi")
 HO = Language(code="ho", name="Hiri Motu")
 HU = Language(code="hu", name="Hungarian")
-IS = Language(code="is", name="Icelandic")
 IO = Language(code="io", name="Ido")
 IG = Language(code="ig", name="Igbo")
 ID = Language(code="id", name="Indonesian")
@@ -91,7 +104,6 @@ IE = Language(code="ie", name="Interlingue")
 IU = Language(code="iu", name="Inuktitut")
 IK = Language(code="ik", name="Inupiaq")
 GA = Language(code="ga", name="Irish")
-IT = Language(code="it", name="Italian")
 JA = Language(code="ja", name="Japanese")
 KN = Language(code="kn", name="Kannada")
 KR = Language(code="kr", name="Kanuri")
@@ -130,9 +142,6 @@ ND = Language(code="nd", name="Northern Ndebele")
 NR = Language(code="nr", name="South Ndebele")
 NG = Language(code="ng", name="Ndonga")
 NE = Language(code="ne", name="Nepali")
-NO = Language(code="no", name="Norwegian")
-NB = Language(code="nb", name="Norwegian Bokmål")
-NN = Language(code="nn", name="Norwegian Nynorsk")
 II = Language(code="ii", name="Sichuan Yi")
 OC = Language(code="oc", name="Occitan")
 OJ = Language(code="oj", name="Ojibwa")
@@ -163,11 +172,9 @@ SK = Language(code="sk", name="Slovak")
 SL = Language(code="sl", name="Slovenian")
 SO = Language(code="so", name="Somali")
 ST = Language(code="st", name="Sotho")
-ES = Language(code="es", name="Spanish")
 SU = Language(code="su", name="Sundanese")
 SW = Language(code="sw", name="Swahili")
 SS = Language(code="ss", name="Swati")
-SV = Language(code="sv", name="Swedish")
 TL = Language(code="tl", name="Tagalog")
 TY = Language(code="ty", name="Tahitian")
 TG = Language(code="tg", name="Tajik")
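The supported languages move to the top of the module and gain _and_separator/_or_separator fields, which presumably let prompts join alternatives in the target language. Purely illustrative — join_with_or is a hypothetical helper, not a EuroEval function:

    def join_with_or(items: list[str], or_separator: str) -> str:
        # Hypothetical helper for "a, b eller c" style joining.
        if len(items) < 2:
            return "".join(items)
        return f"{', '.join(items[:-1])} {or_separator} {items[-1]}"

    # Norwegian defines _or_separator="eller":
    print(join_with_or(["positiv", "negativ", "nøytral"], "eller"))
    # -> "positiv, negativ eller nøytral"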
euroeval/model_cache.py
CHANGED
euroeval/model_loading.py
CHANGED
@@ -8,9 +8,8 @@ from .benchmark_modules import (
     LiteLLMModel,
     VLLMModel,
 )
-from .constants import GENERATIVE_DATASET_TASK_GROUPS
 from .enums import InferenceBackend, ModelType
-from .exceptions import InvalidBenchmark, InvalidModel
+from .exceptions import InvalidModel
 
 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
@@ -59,16 +58,6 @@ def load_model(
         f"inference backend {model_config.inference_backend!r}."
     )
 
-    # Refuse to benchmark non-generative models on generative tasks
-    if (
-        dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
-        and not model_config.model_type == ModelType.GENERATIVE
-    ):
-        raise InvalidBenchmark(
-            f"Cannot benchmark non-generative model {model_config.model_id!r} on "
-            f"generative task {dataset_config.task.name!r}."
-        )
-
     model = model_class(
         model_config=model_config,
         dataset_config=dataset_config,
euroeval/prompt_templates/__init__.py
ADDED
@@ -0,0 +1,8 @@
+"""The different prompt templates used in EuroEval."""
+
+from .linguistic_acceptability import LA_TEMPLATES
+from .multiple_choice import MULTIPLE_CHOICE_TEMPLATES
+from .named_entity_recognition import NER_TEMPLATES
+from .reading_comprehension import RC_TEMPLATES
+from .sentiment_classification import SENT_TEMPLATES
+from .summarization import SUMM_TEMPLATES
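This new package re-exports one template table per task. The diff does not show how the *_TEMPLATES objects are keyed; a plausible reading, given that the deleted monolithic dataset_configs.py carried per-language prompts, is a per-language mapping — treat the sketch below as an assumption, not the package's documented API:

    from euroeval.prompt_templates import SENT_TEMPLATES

    # Assumption: each *_TEMPLATES is a per-language mapping; inspect it to
    # confirm the key type before relying on it.
    print(type(SENT_TEMPLATES))
    if isinstance(SENT_TEMPLATES, dict):
        print(sorted(str(key) for key in SENT_TEMPLATES)[:5])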