euroeval-15.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (40)
  1. euroeval/__init__.py +72 -0
  2. euroeval/benchmark_config_factory.py +358 -0
  3. euroeval/benchmark_modules/__init__.py +7 -0
  4. euroeval/benchmark_modules/base.py +354 -0
  5. euroeval/benchmark_modules/fresh.py +286 -0
  6. euroeval/benchmark_modules/hf.py +1185 -0
  7. euroeval/benchmark_modules/litellm.py +905 -0
  8. euroeval/benchmark_modules/vllm.py +1171 -0
  9. euroeval/benchmarker.py +1074 -0
  10. euroeval/callbacks.py +72 -0
  11. euroeval/cli.py +281 -0
  12. euroeval/constants.py +50 -0
  13. euroeval/data_loading.py +96 -0
  14. euroeval/data_models.py +474 -0
  15. euroeval/dataset_configs.py +2001 -0
  16. euroeval/enums.py +144 -0
  17. euroeval/exceptions.py +191 -0
  18. euroeval/finetuning.py +324 -0
  19. euroeval/generation.py +296 -0
  20. euroeval/human_evaluation.py +737 -0
  21. euroeval/languages.py +200 -0
  22. euroeval/model_cache.py +253 -0
  23. euroeval/model_config.py +77 -0
  24. euroeval/model_loading.py +78 -0
  25. euroeval/scores.py +90 -0
  26. euroeval/speed_benchmark.py +124 -0
  27. euroeval/task_utils/__init__.py +1 -0
  28. euroeval/task_utils/multiple_choice_classification.py +176 -0
  29. euroeval/task_utils/question_answering.py +698 -0
  30. euroeval/task_utils/sequence_classification.py +237 -0
  31. euroeval/task_utils/text_to_text.py +150 -0
  32. euroeval/task_utils/token_classification.py +464 -0
  33. euroeval/tasks.py +202 -0
  34. euroeval/types.py +97 -0
  35. euroeval/utils.py +574 -0
  36. euroeval-15.2.0.dist-info/METADATA +234 -0
  37. euroeval-15.2.0.dist-info/RECORD +40 -0
  38. euroeval-15.2.0.dist-info/WHEEL +4 -0
  39. euroeval-15.2.0.dist-info/entry_points.txt +4 -0
  40. euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0
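
Taken together, the listing shows the package layout: euroeval/benchmarker.py drives an evaluation run, euroeval/benchmark_modules/ provides one backend per model type (fresh, Hugging Face, LiteLLM, vLLM), and euroeval/dataset_configs.py and euroeval/tasks.py define what gets evaluated. As a purely hypothetical orientation sketch (the real entry points live in euroeval/__init__.py and euroeval/cli.py and are not reproduced in this diff), programmatic use would look roughly like this:

    # Hypothetical sketch: assumes `Benchmarker` is re-exported from the package root
    # and that a run can be started by calling the instance with a model ID and a
    # dataset name; check the package's own README/METADATA for the actual API.
    from euroeval import Benchmarker

    benchmarker = Benchmarker()
    benchmarker(model="FacebookAI/xlm-roberta-base", dataset="angry-tweets")
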
euroeval/benchmark_modules/base.py
@@ -0,0 +1,354 @@
+ """Abstract benchmark module class that the model classes inherit from."""
+
+ import collections.abc as c
+ import logging
+ import sys
+ import typing as t
+ from abc import ABC, abstractmethod
+ from functools import cached_property, partial
+
+ from datasets import DatasetDict
+ from torch import nn
+ from tqdm.auto import tqdm
+ from transformers import PreTrainedTokenizer, Trainer
+
+ from ..data_models import (
+     BenchmarkConfig,
+     DatasetConfig,
+     GenerativeModelOutput,
+     ModelConfig,
+     Task,
+ )
+ from ..enums import BatchingPreference, GenerativeType, TaskGroup
+ from ..exceptions import NeedsEnvironmentVariable, NeedsExtraInstalled
+ from ..task_utils import (
+     question_answering,
+     sequence_classification,
+     text_to_text,
+     token_classification,
+ )
+ from ..types import ComputeMetricsFunction, ExtractLabelsFunction
+ from ..utils import log_once
+
+ logger = logging.getLogger("euroeval")
+
+
+ class BenchmarkModule(ABC):
+     """Abstract class for a benchmark module.
+
+     Attributes:
+         model_config:
+             The model configuration.
+         dataset_config:
+             The dataset configuration.
+         benchmark_config:
+             The benchmark configuration.
+         buffer:
+             A buffer to store temporary data.
+     """
+
+     fresh_model: bool
+     batching_preference: BatchingPreference
+     high_priority: bool
+
+     def __init__(
+         self,
+         model_config: ModelConfig,
+         dataset_config: DatasetConfig,
+         benchmark_config: BenchmarkConfig,
+     ) -> None:
+         """Initialise the benchmark module.
+
+         Args:
+             model_config:
+                 The model configuration.
+             dataset_config:
+                 The dataset configuration.
+             benchmark_config:
+                 The benchmark configuration.
+         """
+         self.model_config = model_config
+         self.dataset_config = dataset_config
+         self.benchmark_config = benchmark_config
+         self.buffer: dict[str, t.Any] = dict()
+         self._log_metadata()
+
+     def _log_metadata(self) -> None:
+         """Log the metadata of the model."""
+         # Set logging level based on verbosity
+         if hasattr(sys, "_called_from_test"):
+             logging_level = logging.CRITICAL
+         elif self.benchmark_config.verbose:
+             logging_level = logging.DEBUG
+         else:
+             logging_level = logging.INFO
+         logger.setLevel(logging_level)
+
+         logging_msg: str = ""
+         if self.num_params < 0:
+             logging_msg += "The model has an unknown number of parameters, "
+         else:
+             logging_msg += f"The model has {self.num_params:,} parameters, "
+         if self.vocab_size < 0:
+             logging_msg += "an unknown vocabulary size, "
+         else:
+             logging_msg += f"a vocabulary size of {self.vocab_size:,}, "
+         if self.model_max_length < 0:
+             logging_msg += "and an unknown maximum sequence length."
+         else:
+             logging_msg += f"and a maximum context length of {self.model_max_length:,}."
+         log_once(message=logging_msg, level=logging.INFO)
+
+     def get_pytorch_module(self) -> "nn.Module":
+         """Get the underlying PyTorch module.
+
+         Returns:
+             The PyTorch module.
+         """
+         if hasattr(self, "_model"):
+             return self._model
+         raise NotImplementedError(
+             "The `get_pytorch_module` method has not been implemented for "
+             f"{self.__class__.__name__}."
+         )
+
+     def get_tokenizer(self) -> "PreTrainedTokenizer":
+         """Get the underlying tokenizer.
+
+         Returns:
+             The tokenizer.
+         """
+         if hasattr(self, "_tokenizer"):
+             return self._tokenizer
+         raise NotImplementedError(
+             "The `get_tokenizer` method has not been implemented for "
+             f"{self.__class__.__name__}."
+         )
+
+     @cached_property
+     @abstractmethod
+     def num_params(self) -> int:
+         """The number of parameters in the model.
+
+         Returns:
+             The number of parameters in the model.
+         """
+         ...
+
+     @property
+     @abstractmethod
+     def generative_type(self) -> GenerativeType | None:
+         """Get the generative type of the model.
+
+         Returns:
+             The generative type of the model, or None if the model is not generative.
+         """
+         ...
+
+     @cached_property
+     @abstractmethod
+     def vocab_size(self) -> int:
+         """The vocabulary size of the model.
+
+         Returns:
+             The vocabulary size of the model.
+         """
+         ...
+
+     @cached_property
+     @abstractmethod
+     def model_max_length(self) -> int:
+         """The maximum length of the model.
+
+         Returns:
+             The maximum length of the model.
+         """
+         ...
+
+     @property
+     @abstractmethod
+     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
+         """The data collator used to prepare samples during finetuning.
+
+         Returns:
+             The data collator.
+         """
+         ...
+
+     @property
+     def compute_metrics(self) -> ComputeMetricsFunction:
+         """The function used to compute the metrics.
+
+         Returns:
+             The function used to compute the metrics.
+         """
+         match self.dataset_config.task.task_group:
+             case TaskGroup.SEQUENCE_CLASSIFICATION:
+                 return partial(
+                     sequence_classification.compute_metrics,
+                     dataset_config=self.dataset_config,
+                     benchmark_config=self.benchmark_config,
+                 )
+             case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+                 return partial(
+                     sequence_classification.compute_metrics,
+                     dataset_config=self.dataset_config,
+                     benchmark_config=self.benchmark_config,
+                 )
+             case TaskGroup.TEXT_TO_TEXT:
+                 return partial(
+                     text_to_text.compute_metrics,
+                     dataset_config=self.dataset_config,
+                     benchmark_config=self.benchmark_config,
+                 )
+             case TaskGroup.TOKEN_CLASSIFICATION:
+                 return partial(
+                     token_classification.compute_metrics,
+                     has_misc_tags=self.buffer.get("has_misc_tags", True),
+                     dataset_config=self.dataset_config,
+                     benchmark_config=self.benchmark_config,
+                 )
+             case TaskGroup.QUESTION_ANSWERING:
+                 return partial(
+                     question_answering.compute_metrics,
+                     dataset_config=self.dataset_config,
+                     benchmark_config=self.benchmark_config,
+                 )
+             case _:
+                 raise NotImplementedError(
+                     f"Unsupported task group: {self.dataset_config.task.task_group}."
+                 )
+
+     @property
+     @abstractmethod
+     def extract_labels_from_generation(self) -> ExtractLabelsFunction:
+         """The function used to extract the labels from the generated output.
+
+         Returns:
+             The function used to extract the labels from the generated output.
+         """
+         ...
+
+     @property
+     @abstractmethod
+     def trainer_class(self) -> t.Type["Trainer"]:
+         """The Trainer class to use for finetuning.
+
+         Returns:
+             The Trainer class.
+         """
+         ...
+
+     def prepare_datasets(
+         self, datasets: list[DatasetDict], task: Task
+     ) -> list[DatasetDict]:
+         """Prepare the datasets for the model.
+
+         This includes things like tokenisation.
+
+         Args:
+             datasets:
+                 The datasets to prepare.
+             task:
+                 The task to prepare the datasets for.
+
+         Returns:
+             The prepared datasets.
+         """
+         for idx, dataset in enumerate(
+             tqdm(iterable=datasets, desc="Preparing datasets")
+         ):
+             prepared_dataset = self.prepare_dataset(
+                 dataset=dataset, task=task, itr_idx=idx
+             )
+             if self.dataset_config.task.task_group == TaskGroup.TOKEN_CLASSIFICATION:
+                 labels_in_train: set[str] = {
+                     tag for tag_list in dataset["train"]["labels"] for tag in tag_list
+                 }
+                 self.buffer["has_misc_tags"] = (
+                     "B-MISC" in labels_in_train or "I-MISC" in labels_in_train
+                 )
+             datasets[idx] = DatasetDict(
+                 dict(
+                     train=prepared_dataset["train"],
+                     val=prepared_dataset["val"],
+                     test=prepared_dataset["test"],
+                     original_train=dataset["train"],
+                     original_val=dataset["val"],
+                     original_test=dataset["test"],
+                 )
+             )
+         return datasets
+
+     @abstractmethod
+     def prepare_dataset(
+         self, dataset: DatasetDict, task: Task, itr_idx: int
+     ) -> DatasetDict:
+         """Prepare the dataset for the model.
+
+         This includes things like tokenisation.
+
+         Args:
+             dataset:
+                 The dataset to prepare.
+             task:
+                 The task to prepare the dataset for.
+             itr_idx:
+                 The index of the dataset in the iterator.
+
+         Returns:
+             The prepared dataset.
+         """
+         ...
+
+     def generate(self, inputs: dict) -> GenerativeModelOutput:
+         """Generate outputs from the model.
+
+         Args:
+             inputs:
+                 A batch of inputs to pass through the model.
+
+         Returns:
+             The generated model outputs.
+         """
+         raise NotImplementedError(
+             "The `generate` method has not been implemented for "
+             f"{self.__class__.__name__}."
+         )
+
+     @classmethod
+     @abstractmethod
+     def model_exists(
+         cls, model_id: str, benchmark_config: BenchmarkConfig
+     ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
+         """Check if a model exists.
+
+         Args:
+             model_id:
+                 The model ID.
+             benchmark_config:
+                 The benchmark configuration.
+
+         Returns:
+             Whether the model exists, or an error describing why we cannot check
+             whether the model exists.
+         """
+         ...
+
+     @classmethod
+     @abstractmethod
+     def get_model_config(
+         cls, model_id: str, benchmark_config: BenchmarkConfig
+     ) -> ModelConfig:
+         """Fetch the model configuration.
+
+         Args:
+             model_id:
+                 The model ID.
+             benchmark_config:
+                 The benchmark configuration.
+
+         Returns:
+             The model configuration.
+         """
+         ...
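
Every concrete backend in euroeval/benchmark_modules/ has to fill in the abstract members above. The skeleton below is a minimal, hypothetical sketch of that contract, based only on the interface shown in this diff; the class name, the dummy numbers and the no-op bodies are illustrative and not part of the package, and `batching_preference` is left unset because the `BatchingPreference` members are not shown here.

    import collections.abc as c
    import typing as t

    from datasets import DatasetDict
    from transformers import Trainer

    from euroeval.benchmark_modules.base import BenchmarkModule
    from euroeval.data_models import BenchmarkConfig, ModelConfig, Task
    from euroeval.enums import GenerativeType
    from euroeval.types import ExtractLabelsFunction


    class DummyEncoderModule(BenchmarkModule):
        """Hypothetical subclass illustrating the abstract interface above."""

        fresh_model = False
        high_priority = False
        # `batching_preference` must also be set to a `BatchingPreference` member.

        @property
        def num_params(self) -> int:
            return 125_000_000  # illustrative value

        @property
        def generative_type(self) -> GenerativeType | None:
            return None  # not a generative model

        @property
        def vocab_size(self) -> int:
            return 50_000  # illustrative value

        @property
        def model_max_length(self) -> int:
            return 512  # illustrative value

        @property
        def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
            return lambda samples: {"samples": samples}  # placeholder collator

        @property
        def extract_labels_from_generation(self) -> ExtractLabelsFunction:
            raise NotImplementedError  # only relevant for generative models

        @property
        def trainer_class(self) -> t.Type[Trainer]:
            return Trainer

        def prepare_dataset(
            self, dataset: DatasetDict, task: Task, itr_idx: int
        ) -> DatasetDict:
            return dataset  # a real module would tokenise here

        @classmethod
        def model_exists(
            cls, model_id: str, benchmark_config: BenchmarkConfig
        ) -> bool:
            return model_id == "dummy-model"

        @classmethod
        def get_model_config(
            cls, model_id: str, benchmark_config: BenchmarkConfig
        ) -> ModelConfig:
            raise NotImplementedError  # a real module would build a ModelConfig here
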
euroeval/benchmark_modules/fresh.py
@@ -0,0 +1,286 @@
+ """Freshly initialised encoder models."""
+
+ import os
+ from functools import cached_property
+ from json import JSONDecodeError
+
+ from transformers import (
+     AutoConfig,
+     AutoTokenizer,
+     ElectraForQuestionAnswering,
+     ElectraForSequenceClassification,
+     ElectraForTokenClassification,
+     PretrainedConfig,
+     PreTrainedModel,
+     PreTrainedTokenizer,
+     XLMRobertaForQuestionAnswering,
+     XLMRobertaForSequenceClassification,
+     XLMRobertaForTokenClassification,
+ )
+
+ from ..data_models import BenchmarkConfig, DatasetConfig, ModelConfig
+ from ..enums import InferenceBackend, ModelType, TaskGroup
+ from ..exceptions import (
+     InvalidBenchmark,
+     InvalidModel,
+     NeedsEnvironmentVariable,
+     NeedsExtraInstalled,
+ )
+ from ..utils import block_terminal_output, create_model_cache_dir
+ from .hf import (
+     HuggingFaceEncoderModel,
+     align_model_and_tokenizer,
+     setup_model_for_question_answering,
+ )
+
+
+ class FreshEncoderModel(HuggingFaceEncoderModel):
+     """A freshly initialised encoder model."""
+
+     fresh_model = True
+
+     def __init__(
+         self,
+         model_config: ModelConfig,
+         dataset_config: DatasetConfig,
+         benchmark_config: BenchmarkConfig,
+     ) -> None:
+         """Initialise the model.
+
+         Args:
+             model_config:
+                 The model configuration.
+             dataset_config:
+                 The dataset configuration.
+             benchmark_config:
+                 The benchmark configuration.
+         """
+         # This is already set when calling `super.__init__`, but we need it to get a
+         # value from `self.model_max_length`, so we set it here as well.
+         self.model_config = model_config
+
+         model, tokenizer = load_model_and_tokenizer(
+             model_config=model_config,
+             dataset_config=dataset_config,
+             benchmark_config=benchmark_config,
+             model_max_length=self.model_max_length,
+         )
+         self._model: PreTrainedModel = model
+         self._tokenizer: PreTrainedTokenizer = tokenizer
+
+         self._model, self._tokenizer = align_model_and_tokenizer(
+             model=self._model,
+             tokenizer=self._tokenizer,
+             model_max_length=self.model_max_length,
+             raise_errors=benchmark_config.raise_errors,
+         )
+
+         # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
+         # to call the `__init__` method of the `BenchmarkModule` class.
+         super(HuggingFaceEncoderModel, self).__init__(
+             model_config=model_config,
+             dataset_config=dataset_config,
+             benchmark_config=benchmark_config,
+         )
+
+     @cached_property
+     def num_params(self) -> int:
+         """The number of parameters in the model.
+
+         Returns:
+             The number of parameters in the model.
+         """
+         match self.model_config.model_id:
+             case "fresh-xlm-roberta-base":
+                 return 278_885_778
+             case "fresh-electra-small":
+                 return 13_738_755
+             case _:
+                 raise NotImplementedError(
+                     f"Number of parameters for model {self.model_config.model_id} is "
+                     "not implemented."
+                 )
+
+     @cached_property
+     def vocab_size(self) -> int:
+         """The vocabulary size of the model.
+
+         Returns:
+             The vocabulary size of the model.
+         """
+         match self.model_config.model_id:
+             case "fresh-xlm-roberta-base":
+                 return 250_002
+             case "fresh-electra-small":
+                 return 32_000
+             case _:
+                 raise NotImplementedError(
+                     f"Vocabulary size for model {self.model_config.model_id} is not "
+                     "implemented."
+                 )
+
+     @cached_property
+     def model_max_length(self) -> int:
+         """The maximum context length of the model.
+
+         Returns:
+             The maximum context length of the model.
+         """
+         match self.model_config.model_id:
+             case "fresh-xlm-roberta-base":
+                 return 512
+             case "fresh-electra-small":
+                 return 128
+             case _:
+                 raise NotImplementedError(
+                     f"Maximum context length for model {self.model_config.model_id} is "
+                     "not implemented."
+                 )
+
+     @classmethod
+     def model_exists(
+         cls, model_id: str, benchmark_config: BenchmarkConfig
+     ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
+         """Check if a model exists.
+
+         Args:
+             model_id:
+                 The model ID.
+             benchmark_config:
+                 The benchmark configuration.
+
+         Returns:
+             Whether the model exists, or an error describing why we cannot check
+             whether the model exists.
+         """
+         valid_models = ["fresh-electra-small", "fresh-xlm-roberta-base"]
+         return model_id in valid_models
+
+     @classmethod
+     def get_model_config(
+         cls, model_id: str, benchmark_config: BenchmarkConfig
+     ) -> ModelConfig:
+         """Fetch the model configuration.
+
+         Args:
+             model_id:
+                 The model ID.
+             benchmark_config:
+                 The benchmark configuration.
+
+         Returns:
+             The model configuration.
+         """
+         return ModelConfig(
+             model_id=model_id,
+             task="fill-mask",
+             languages=list(),
+             revision="main",
+             merge=False,
+             inference_backend=InferenceBackend.TRANSFORMERS,
+             model_type=ModelType.ENCODER,
+             fresh=True,
+             model_cache_dir=create_model_cache_dir(
+                 cache_dir=benchmark_config.cache_dir, model_id=model_id
+             ),
+             adapter_base_model_id=None,
+         )
+
+
+ def load_model_and_tokenizer(
+     model_config: ModelConfig,
+     dataset_config: DatasetConfig,
+     benchmark_config: BenchmarkConfig,
+     model_max_length: int,
+ ) -> tuple[PreTrainedModel, PreTrainedTokenizer]:
+     """Load the model and tokenizer.
+
+     Args:
+         model_config:
+             The model configuration.
+         dataset_config:
+             The dataset configuration.
+         benchmark_config:
+             The benchmark configuration.
+         model_max_length:
+             The maximum context length of the model.
+
+     Returns:
+         The loaded model and tokenizer.
+     """
+     config: "PretrainedConfig"
+     block_terminal_output()
+
+     # Get the fresh model ID and the corresponding real model ID
+     model_id = model_config.model_id.replace("-", "_")
+     fresh_to_real_model_id_mapping = dict(
+         fresh_xlm_roberta_base="FacebookAI/xlm-roberta-base",
+         fresh_electra_small="google/electra-small-discriminator",
+     )
+     real_model_id = fresh_to_real_model_id_mapping[model_id]
+
+     match dataset_config.task.task_group:
+         case (
+             TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+         ):
+             model_cls_mapping = dict(
+                 fresh_xlm_roberta_base=XLMRobertaForSequenceClassification,
+                 fresh_electra_small=ElectraForSequenceClassification,
+             )
+         case TaskGroup.TOKEN_CLASSIFICATION:
+             model_cls_mapping = dict(
+                 fresh_xlm_roberta_base=XLMRobertaForTokenClassification,
+                 fresh_electra_small=ElectraForTokenClassification,
+             )
+         case TaskGroup.QUESTION_ANSWERING:
+             model_cls_mapping = dict(
+                 fresh_xlm_roberta_base=XLMRobertaForQuestionAnswering,
+                 fresh_electra_small=ElectraForQuestionAnswering,
+             )
+         case _:
+             raise InvalidBenchmark(
+                 f"Task group {dataset_config.task.task_group} is not "
+                 f"supported for model {model_config.model_id}."
+             )
+     model_cls = model_cls_mapping[model_id]
+
+     config = AutoConfig.from_pretrained(
+         real_model_id,
+         token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+         num_labels=dataset_config.num_labels,
+         id2label=dataset_config.id2label,
+         label2id=dataset_config.label2id,
+         cache_dir=model_config.model_cache_dir,
+         trust_remote_code=benchmark_config.trust_remote_code,
+     )
+     model = model_cls(config)
+
+     if dataset_config.task.task_group == TaskGroup.QUESTION_ANSWERING:
+         model = setup_model_for_question_answering(model=model)
+
+     # Load the tokenizer. If the model is a subclass of a RoBERTa model then we
+     # have to add a prefix space to the tokens, by the way the model is constructed
+     prefix_models = ["Roberta", "GPT", "Deberta"]
+     prefix = any(model_type in type(model).__name__ for model_type in prefix_models)
+     try:
+         tokenizer: "PreTrainedTokenizer" = AutoTokenizer.from_pretrained(
+             real_model_id,
+             revision=model_config.revision,
+             token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+             add_prefix_space=prefix,
+             cache_dir=model_config.model_cache_dir,
+             use_fast=True,
+             verbose=False,
+             trust_remote_code=benchmark_config.trust_remote_code,
+         )
+     except (JSONDecodeError, OSError):
+         raise InvalidModel(f"Could not load tokenizer for model {real_model_id!r}.")
+
+     model, tokenizer = align_model_and_tokenizer(
+         model=model,
+         tokenizer=tokenizer,
+         model_max_length=model_max_length,
+         raise_errors=benchmark_config.raise_errors,
+     )
+
+     return model, tokenizer
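
The point of load_model_and_tokenizer above is that only the configuration and tokenizer of the real backbone are downloaded; the weights stay randomly initialised because the model class is instantiated directly from the config instead of via from_pretrained. The standalone snippet below reproduces that pattern for the fresh-xlm-roberta-base case using plain transformers; the three sentiment labels are made up for the example and do not come from any EuroEval dataset config.

    from transformers import AutoConfig, AutoTokenizer, XLMRobertaForSequenceClassification

    # Resolve the fresh model ID to its real backbone, as in the mapping above.
    real_model_id = "FacebookAI/xlm-roberta-base"

    # Only the config is fetched; num_labels/id2label/label2id shape the task head.
    config = AutoConfig.from_pretrained(
        real_model_id,
        num_labels=3,
        id2label={0: "negative", 1: "neutral", 2: "positive"},
        label2id={"negative": 0, "neutral": 1, "positive": 2},
    )

    # Instantiating from the config (rather than `from_pretrained`) yields a model with
    # randomly initialised weights: a "fresh" encoder with a pretrained architecture.
    model = XLMRobertaForSequenceClassification(config)

    # The tokenizer is still loaded from the Hub, since even a fresh model needs the
    # real vocabulary (250,002 tokens for XLM-RoBERTa).
    tokenizer = AutoTokenizer.from_pretrained(real_model_id, use_fast=True)
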