PyPI - EuroEval - Versions diffs - 15.2.0__py3-none-any.whl - Mend

EuroEval 15.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of EuroEval might be problematic. Click here for more details.

Files changed (40) hide show

euroeval/__init__.py +72 -0
euroeval/benchmark_config_factory.py +358 -0
euroeval/benchmark_modules/__init__.py +7 -0
euroeval/benchmark_modules/base.py +354 -0
euroeval/benchmark_modules/fresh.py +286 -0
euroeval/benchmark_modules/hf.py +1185 -0
euroeval/benchmark_modules/litellm.py +905 -0
euroeval/benchmark_modules/vllm.py +1171 -0
euroeval/benchmarker.py +1074 -0
euroeval/callbacks.py +72 -0
euroeval/cli.py +281 -0
euroeval/constants.py +50 -0
euroeval/data_loading.py +96 -0
euroeval/data_models.py +474 -0
euroeval/dataset_configs.py +2001 -0
euroeval/enums.py +144 -0
euroeval/exceptions.py +191 -0
euroeval/finetuning.py +324 -0
euroeval/generation.py +296 -0
euroeval/human_evaluation.py +737 -0
euroeval/languages.py +200 -0
euroeval/model_cache.py +253 -0
euroeval/model_config.py +77 -0
euroeval/model_loading.py +78 -0
euroeval/scores.py +90 -0
euroeval/speed_benchmark.py +124 -0
euroeval/task_utils/__init__.py +1 -0
euroeval/task_utils/multiple_choice_classification.py +176 -0
euroeval/task_utils/question_answering.py +698 -0
euroeval/task_utils/sequence_classification.py +237 -0
euroeval/task_utils/text_to_text.py +150 -0
euroeval/task_utils/token_classification.py +464 -0
euroeval/tasks.py +202 -0
euroeval/types.py +97 -0
euroeval/utils.py +574 -0
euroeval-15.2.0.dist-info/METADATA +234 -0
euroeval-15.2.0.dist-info/RECORD +40 -0
euroeval-15.2.0.dist-info/WHEEL +4 -0
euroeval-15.2.0.dist-info/entry_points.txt +4 -0
euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0

euroeval/languages.py ADDED Viewed

@@ -0,0 +1,200 @@
+"""List of languages and their ISO 639-1 codes.
+Taken from https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes.
+Last updated 19 June 2022.
+"""
+from .data_models import Language
+def get_all_languages() -> dict[str, Language]:
+    """Get a list of all the languages.
+    Returns:
+        A mapping between language codes and their configurations.
+    """
+    return {cfg.code: cfg for cfg in globals().values() if isinstance(cfg, Language)}
+# One module-level `Language` constant per language code. `get_all_languages`
+# finds these by scanning the module globals, so a constant added here is
+# returned by it without any further registration.
+AB = Language(code="ab", name="Abkhazian")
+AA = Language(code="aa", name="Afar")
+AF = Language(code="af", name="Afrikaans")
+SQ = Language(code="sq", name="Albanian")
+AM = Language(code="am", name="Amharic")
+AR = Language(code="ar", name="Arabic")
+AN = Language(code="an", name="Aragonese")
+HY = Language(code="hy", name="Armenian")
+AS = Language(code="as", name="Assamese")
+AV = Language(code="av", name="Avaric")
+AE = Language(code="ae", name="Avestan")
+AY = Language(code="ay", name="Aymara")
+AZ = Language(code="az", name="Azerbaijani")
+BM = Language(code="bm", name="Bambara")
+BA = Language(code="ba", name="Bashkir")
+EU = Language(code="eu", name="Basque")
+BE = Language(code="be", name="Belarusian")
+BN = Language(code="bn", name="Bengali")
+BI = Language(code="bi", name="Bislama")
+BS = Language(code="bs", name="Bosnian")
+BR = Language(code="br", name="Breton")
+BG = Language(code="bg", name="Bulgarian")
+MY = Language(code="my", name="Burmese")
+CA = Language(code="ca", name="Catalan")
+CH = Language(code="ch", name="Chamorro")
+CE = Language(code="ce", name="Chechen")
+NY = Language(code="ny", name="Chichewa")
+ZH = Language(code="zh", name="Chinese")
+CU = Language(code="cu", name="Church Slavic")
+CV = Language(code="cv", name="Chuvash")
+KW = Language(code="kw", name="Cornish")
+CO = Language(code="co", name="Corsican")
+CR = Language(code="cr", name="Cree")
+HR = Language(code="hr", name="Croatian")
+CS = Language(code="cs", name="Czech")
+DA = Language(code="da", name="Danish")
+DV = Language(code="dv", name="Divehi")
+NL = Language(code="nl", name="Dutch")
+DZ = Language(code="dz", name="Dzongkha")
+EN = Language(code="en", name="English")
+EO = Language(code="eo", name="Esperanto")
+ET = Language(code="et", name="Estonian")
+EE = Language(code="ee", name="Ewe")
+FO = Language(code="fo", name="Faroese")
+FJ = Language(code="fj", name="Fijian")
+FI = Language(code="fi", name="Finnish")
+FR = Language(code="fr", name="French")
+FY = Language(code="fy", name="Western Frisian")
+FF = Language(code="ff", name="Fulah")
+GD = Language(code="gd", name="Gaelic")
+GL = Language(code="gl", name="Galician")
+LG = Language(code="lg", name="Ganda")
+KA = Language(code="ka", name="Georgian")
+DE = Language(code="de", name="German")
+EL = Language(code="el", name="Greek")
+KL = Language(code="kl", name="Greenlandic")
+GN = Language(code="gn", name="Guarani")
+GU = Language(code="gu", name="Gujarati")
+HT = Language(code="ht", name="Haitian")
+HA = Language(code="ha", name="Hausa")
+HE = Language(code="he", name="Hebrew")
+HZ = Language(code="hz", name="Herero")
+HI = Language(code="hi", name="Hindi")
+HO = Language(code="ho", name="Hiri Motu")
+HU = Language(code="hu", name="Hungarian")
+IS = Language(code="is", name="Icelandic")
+IO = Language(code="io", name="Ido")
+IG = Language(code="ig", name="Igbo")
+ID = Language(code="id", name="Indonesian")
+IA = Language(code="ia", name="Interlingua")
+IE = Language(code="ie", name="Interlingue")
+IU = Language(code="iu", name="Inuktitut")
+IK = Language(code="ik", name="Inupiaq")
+GA = Language(code="ga", name="Irish")
+IT = Language(code="it", name="Italian")
+JA = Language(code="ja", name="Japanese")
+KN = Language(code="kn", name="Kannada")
+KR = Language(code="kr", name="Kanuri")
+KS = Language(code="ks", name="Kashmiri")
+KK = Language(code="kk", name="Kazakh")
+KM = Language(code="km", name="Central Khmer")
+KI = Language(code="ki", name="Kikuyu")
+RW = Language(code="rw", name="Kinyarwanda")
+KY = Language(code="ky", name="Kirghiz")
+KV = Language(code="kv", name="Komi")
+KG = Language(code="kg", name="Kongo")
+KO = Language(code="ko", name="Korean")
+KJ = Language(code="kj", name="Kuanyama")
+KU = Language(code="ku", name="Kurdish")
+LO = Language(code="lo", name="Lao")
+LA = Language(code="la", name="Latin")
+LV = Language(code="lv", name="Latvian")
+LI = Language(code="li", name="Limburgan")
+LN = Language(code="ln", name="Lingala")
+LT = Language(code="lt", name="Lithuanian")
+LU = Language(code="lu", name="Luba-Katanga")
+LB = Language(code="lb", name="Luxembourgish")
+MK = Language(code="mk", name="Macedonian")
+MG = Language(code="mg", name="Malagasy")
+MS = Language(code="ms", name="Malay")
+ML = Language(code="ml", name="Malayalam")
+MT = Language(code="mt", name="Maltese")
+GV = Language(code="gv", name="Manx")
+MI = Language(code="mi", name="Maori")
+MR = Language(code="mr", name="Marathi")
+MH = Language(code="mh", name="Marshallese")
+MN = Language(code="mn", name="Mongolian")
+NA = Language(code="na", name="Nauru")
+NV = Language(code="nv", name="Navajo")
+ND = Language(code="nd", name="Northern Ndebele")
+NR = Language(code="nr", name="South Ndebele")
+NG = Language(code="ng", name="Ndonga")
+NE = Language(code="ne", name="Nepali")
+NO = Language(code="no", name="Norwegian")
+NB = Language(code="nb", name="Norwegian Bokmål")
+NN = Language(code="nn", name="Norwegian Nynorsk")
+II = Language(code="ii", name="Sichuan Yi")
+OC = Language(code="oc", name="Occitan")
+OJ = Language(code="oj", name="Ojibwa")
+OR = Language(code="or", name="Oriya")
+OM = Language(code="om", name="Oromo")
+OS = Language(code="os", name="Ossetian")
+PI = Language(code="pi", name="Pali")
+PS = Language(code="ps", name="Pashto")
+FA = Language(code="fa", name="Persian")
+PL = Language(code="pl", name="Polish")
+PT = Language(code="pt", name="Portuguese")
+PA = Language(code="pa", name="Punjabi")
+QU = Language(code="qu", name="Quechua")
+RO = Language(code="ro", name="Romanian")
+RM = Language(code="rm", name="Romansh")
+RN = Language(code="rn", name="Rundi")
+RU = Language(code="ru", name="Russian")
+SE = Language(code="se", name="Northern Sami")
+SM = Language(code="sm", name="Samoan")
+SG = Language(code="sg", name="Sango")
+SA = Language(code="sa", name="Sanskrit")
+SC = Language(code="sc", name="Sardinian")
+SR = Language(code="sr", name="Serbian")
+SN = Language(code="sn", name="Shona")
+SD = Language(code="sd", name="Sindhi")
+SI = Language(code="si", name="Sinhala")
+SK = Language(code="sk", name="Slovak")
+SL = Language(code="sl", name="Slovenian")
+SO = Language(code="so", name="Somali")
+ST = Language(code="st", name="Sotho")
+ES = Language(code="es", name="Spanish")
+SU = Language(code="su", name="Sundanese")
+SW = Language(code="sw", name="Swahili")
+SS = Language(code="ss", name="Swati")
+SV = Language(code="sv", name="Swedish")
+TL = Language(code="tl", name="Tagalog")
+TY = Language(code="ty", name="Tahitian")
+TG = Language(code="tg", name="Tajik")
+TA = Language(code="ta", name="Tamil")
+TT = Language(code="tt", name="Tatar")
+TE = Language(code="te", name="Telugu")
+TH = Language(code="th", name="Thai")
+BO = Language(code="bo", name="Tibetan")
+TI = Language(code="ti", name="Tigrinya")
+TO = Language(code="to", name="Tonga")
+TS = Language(code="ts", name="Tsonga")
+TN = Language(code="tn", name="Tswana")
+TR = Language(code="tr", name="Turkish")
+TK = Language(code="tk", name="Turkmen")
+TW = Language(code="tw", name="Twi")
+UG = Language(code="ug", name="Uighur")
+UK = Language(code="uk", name="Ukrainian")
+UR = Language(code="ur", name="Urdu")
+UZ = Language(code="uz", name="Uzbek")
+VE = Language(code="ve", name="Venda")
+VI = Language(code="vi", name="Vietnamese")
+VO = Language(code="vo", name="Volapük")
+WA = Language(code="wa", name="Walloon")
+CY = Language(code="cy", name="Welsh")
+WO = Language(code="wo", name="Wolof")
+XH = Language(code="xh", name="Xhosa")
+YI = Language(code="yi", name="Yiddish")
+YO = Language(code="yo", name="Yoruba")
+ZA = Language(code="za", name="Zhuang")
+ZU = Language(code="zu", name="Zulu")

euroeval/model_cache.py ADDED Viewed

@@ -0,0 +1,253 @@
+"""ModelCache class for caching model outputs."""
+import hashlib
+import json
+import logging
+import sys
+import typing as t
+from collections import defaultdict
+from dataclasses import asdict
+from tqdm.auto import tqdm
+from .data_models import GenerativeModelOutput, SingleGenerativeModelOutput
+if t.TYPE_CHECKING:
+    from pathlib import Path
+    from datasets import Dataset
+logger = logging.getLogger("euroeval")
+class ModelCache:
+    """A cache for model outputs.
+    Attributes:
+        model_cache_dir:
+            The directory to store the cache in.
+        cache_path:
+            The path to the cache file.
+        cache:
+            The model output cache.
+        max_generated_tokens:
+            The maximum number of tokens to generate for each example.
+    """
+    def __init__(
+        self, model_cache_dir: "Path", cache_name: str, max_generated_tokens: int
+    ) -> None:
+        """Initialize the model output cache.
+        Args:
+            model_cache_dir:
+                The directory to store the cache in.
+            cache_name:
+                The name of the cache file.
+            max_generated_tokens:
+                The maximum number of tokens to generate for each example.
+        """
+        self.model_cache_dir = model_cache_dir
+        self.model_cache_dir.mkdir(parents=True, exist_ok=True)
+        self.cache_path = self.model_cache_dir / cache_name.replace("/", "--")
+        self.max_generated_tokens = max_generated_tokens
+    def load(self) -> None:
+        """Load the model output cache."""
+        if not self.cache_path.exists():
+            with self.cache_path.open("w") as f:
+                json.dump(dict(), f)
+        try:
+            with self.cache_path.open() as f:
+                json_cache = json.load(f)
+        except json.JSONDecodeError:
+            logger.warning(
+                f"Failed to load the cache from {self.cache_path}. The cache will be "
+                f"re-initialised."
+            )
+            json_cache = dict()
+            with self.cache_path.open("w") as f:
+                json.dump(dict(), f)
+        cache: dict[str, SingleGenerativeModelOutput] = dict()
+        for key in json_cache:
+            cache[key] = SingleGenerativeModelOutput(**json_cache[key])
+        self.cache = cache
+    def save(self) -> None:
+        """Save the model output cache to disk."""
+        dumpable_cache: dict[str, dict] = defaultdict(dict)
+        for key, value in self.cache.items():
+            dumpable_cache[key] = asdict(value)
+        try:
+            with self.cache_path.open("w") as f:
+                json.dump(dumpable_cache, f)
+        except KeyError:
+            logger.warning(
+                f"Failed to load the cache from {self.cache_path}. The cache will be "
+                f"re-initialised."
+            )
+            self.cache = dict()
+            with self.cache_path.open("w") as f:
+                json.dump(dict(), f)
+    def _hash_key(self, key: str | list[dict[str, str]]) -> str:
+        """Hash the key to use as an index in the cache.
+        Args:
+            key:
+                The key to hash.
+        Returns:
+            The hashed key.
+        """
+        return hashlib.md5(string=str(key).encode()).hexdigest()
+    def __getitem__(
+        self, key: str | list[dict[str, str]]
+    ) -> SingleGenerativeModelOutput:
+        """Get an item from the cache.
+        Args:
+            key:
+                The key to use to index the cache.
+        Returns:
+            The model output.
+        """
+        hashed_key = self._hash_key(key=key)
+        return self.cache[hashed_key]
+    def __setitem__(
+        self, key: str | list[dict[str, str]], value: SingleGenerativeModelOutput
+    ) -> None:
+        """Set an item in the cache.
+        Args:
+            key:
+                The key to use to index the cache.
+            value:
+                The value to set in the cache.
+        """
+        hashed_key = self._hash_key(key=key)
+        self.cache[hashed_key] = value
+    def remove(self) -> None:
+        """Remove the cache from memory and delete it from disk."""
+        self.cache_path.unlink()
+        del self.cache
+    def __contains__(self, key: str | list[dict[str, str]]) -> bool:
+        """Check if a key is in the cache.
+        Args:
+            key:
+                The key to check.
+        Returns:
+            Whether the key is in the cache.
+        """
+        hashed_key = self._hash_key(key=key)
+        return hashed_key in self.cache
+    def add_to_cache(
+        self, model_inputs: dict, model_output: GenerativeModelOutput
+    ) -> None:
+        """Add the model input/output to the cache.
+        Args:
+            model_inputs:
+                The model inputs.
+            model_output:
+                The model output.
+        """
+        input_column = "messages" if "messages" in model_inputs else "text"
+        model_inputs = model_inputs[input_column]
+        # Store the generated sequences in the cache, one by one
+        with tqdm(
+            iterable=model_inputs,
+            desc="Caching model outputs",
+            leave=False,
+            disable=hasattr(sys, "_called_from_test"),
+        ) as pbar:
+            for sample_idx, model_input in enumerate(pbar):
+                # Extract the scores from the model output, to be cached. We only store
+                # the indices of the top scores, to save space. Further, we only store
+                # the scores if the generated sequence is shorter than the maximum
+                # length
+                if model_output.scores is not None and self.max_generated_tokens < 8:
+                    assert model_output.scores is not None
+                    scores = model_output.scores[sample_idx]
+                else:
+                    scores = None
+                self[model_input] = SingleGenerativeModelOutput(
+                    sequence=model_output.sequences[sample_idx], scores=scores
+                )
+def split_dataset_into_cached_and_non_cached(
+    dataset: "Dataset", cache: ModelCache
+) -> tuple["Dataset", "Dataset"]:
+    """Split a dataset into a cached and non-cached part.
+    Args:
+        dataset:
+            The dataset to split.
+        cache:
+            The model output cache.
+    Returns:
+        The cached and non-cached parts of the dataset.
+    """
+    # Get the sample indices of the non-cached examples, which are unique with respect
+    # to the "text" column.
+    input_column = "messages" if "messages" in dataset.column_names else "text"
+    dataset_texts = dataset[input_column]
+    unique_non_cached_ids = set()
+    unique_texts = list()
+    for idx, dataset_text in enumerate(dataset_texts):
+        if dataset_text not in cache and dataset_text not in unique_texts:
+            unique_non_cached_ids.add(idx)
+            unique_texts.append(dataset_text)
+    # The cached examples are the ones that are not in the non-cached examples. This
+    # means that if the dataset has duplicates, only a single copy of the duplicate
+    # will be put in the non-cached part, and the rest in the cached part.
+    cached_ids = set(range(len(dataset))) - unique_non_cached_ids
+    cached = dataset.select(cached_ids)
+    non_cached = dataset.select(unique_non_cached_ids)
+    return cached, non_cached
+def load_cached_model_outputs(
+    cached_dataset: "Dataset", cache: ModelCache
+) -> GenerativeModelOutput:
+    """Load the cached model outputs.
+    Args:
+        cached_dataset:
+            The dataset containing the cached examples.
+        cache:
+            The model output cache.
+    Returns:
+        The model output containing the cached sequences.
+    """
+    input_column = "messages" if "messages" in cached_dataset.column_names else "text"
+    cached_model_outputs: list[SingleGenerativeModelOutput] = [
+        cache[prompt] for prompt in cached_dataset[input_column]
+    ]
+    cached_sequences = [model_output.sequence for model_output in cached_model_outputs]
+    if cached_model_outputs[0].scores is None:
+        return GenerativeModelOutput(sequences=cached_sequences)
+    cached_scores = [model_output.scores or [] for model_output in cached_model_outputs]
+    return GenerativeModelOutput(sequences=cached_sequences, scores=cached_scores)

euroeval/model_config.py ADDED Viewed

@@ -0,0 +1,77 @@
+"""Functions related to getting the model configuration."""
+import logging
+import typing as t
+from . import benchmark_modules
+from .exceptions import InvalidModel, NeedsEnvironmentVariable, NeedsExtraInstalled
+if t.TYPE_CHECKING:
+    from .data_models import BenchmarkConfig, ModelConfig
+logger = logging.getLogger("euroeval")
+def get_model_config(
+    model_id: str, benchmark_config: "BenchmarkConfig"
+) -> "ModelConfig":
+    """Fetches configuration for a model.
+    Args:
+        model_id:
+            The model ID.
+        benchmark_config:
+            The configuration of the benchmark.
+    Returns:
+        The model configuration.
+    Raises:
+        InvalidModel:
+            If all model setups can handle the model, but the model does not exist.
+    """
+    all_benchmark_modules = [
+        cls
+        for cls in benchmark_modules.__dict__.values()
+        if isinstance(cls, type)
+        and issubclass(cls, benchmark_modules.BenchmarkModule)
+        and cls is not benchmark_modules.BenchmarkModule
+    ]
+    all_benchmark_modules.sort(key=lambda cls: cls.high_priority, reverse=True)
+    needs_extras: list[str] = list()
+    needs_env_vars: list[str] = list()
+    for benchmark_module in all_benchmark_modules:
+        exists_or_err = benchmark_module.model_exists(
+            model_id=model_id, benchmark_config=benchmark_config
+        )
+        if isinstance(exists_or_err, NeedsExtraInstalled):
+            needs_extras.append(exists_or_err.extra)
+        elif isinstance(exists_or_err, NeedsEnvironmentVariable):
+            needs_env_vars.append(exists_or_err.env_var)
+        elif exists_or_err is True:
+            logger.debug(
+                f"The model {model_id!r} was identified by the "
+                f"{benchmark_module.__name__} benchmark module."
+            )
+            model_config = benchmark_module.get_model_config(
+                model_id=model_id, benchmark_config=benchmark_config
+            )
+            return model_config
+    else:
+        msg = f"Model {model_id} not found."
+        if needs_extras:
+            msg += (
+                " However, it is possible that the model exists, but a package "
+                "needs to be installed to check if it exists. Please try running "
+                f"`pip install euroeval[{','.join(needs_extras)}]` or `pip install "
+                "euroeval[all]`, and try again."
+            )
+        elif needs_env_vars:
+            msg += (
+                " However, it is possible that the model exists, but an environment "
+                "variable needs to be set to check if it exists. Please set the "
+                f"environment variables {','.join(needs_env_vars)} and try again."
+            )
+        raise InvalidModel(msg)

euroeval/model_loading.py ADDED Viewed

@@ -0,0 +1,78 @@
+"""Functions related to the loading of models."""
+import typing as t
+from .benchmark_modules import (
+    FreshEncoderModel,
+    HuggingFaceEncoderModel,
+    LiteLLMModel,
+    VLLMModel,
+)
+from .constants import GENERATIVE_DATASET_TASK_GROUPS
+from .enums import InferenceBackend, ModelType
+from .exceptions import InvalidBenchmark, InvalidModel
+if t.TYPE_CHECKING:
+    from .benchmark_modules import BenchmarkModule
+    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
+def load_model(
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+) -> "BenchmarkModule":
+    """Load a model.
+    Args:
+        model_config:
+            The model configuration.
+        dataset_config:
+            The dataset configuration.
+        benchmark_config:
+            The benchmark configuration.
+    Returns:
+        The model.
+    """
+    # The order matters; the first model type that matches will be used. For this
+    # reason, they have been ordered in terms of the most common model types.
+    model_class: t.Type[BenchmarkModule]
+    match (model_config.model_type, model_config.inference_backend, model_config.fresh):
+        case (ModelType.GENERATIVE, InferenceBackend.VLLM, False):
+            model_class = VLLMModel
+        case (ModelType.ENCODER, InferenceBackend.TRANSFORMERS, False):
+            model_class = HuggingFaceEncoderModel
+        case (ModelType.GENERATIVE, InferenceBackend.LITELLM, False):
+            model_class = LiteLLMModel
+        case (ModelType.ENCODER, InferenceBackend.TRANSFORMERS, True):
+            model_class = FreshEncoderModel
+        case (_, _, True):
+            raise InvalidModel(
+                "Cannot load a freshly initialised model with the model type "
+                f"{model_config.model_type!r} and inference backend "
+                f"{model_config.inference_backend!r}."
+            )
+        case _:
+            raise InvalidModel(
+                f"Cannot load model with model type {model_config.model_type!r} and "
+                f"inference backend {model_config.inference_backend!r}."
+            )
+    # Refuse to benchmark non-generative models on generative tasks
+    if (
+        dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
+        and not model_config.model_type == ModelType.GENERATIVE
+    ):
+        raise InvalidBenchmark(
+            f"Cannot benchmark non-generative model {model_config.model_id!r} on "
+            f"generative task {dataset_config.task.name!r}."
+        )
+    model = model_class(
+        model_config=model_config,
+        dataset_config=dataset_config,
+        benchmark_config=benchmark_config,
+    )
+    return model