EuroEval 15.2.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (40)
  1. euroeval/__init__.py +72 -0
  2. euroeval/benchmark_config_factory.py +358 -0
  3. euroeval/benchmark_modules/__init__.py +7 -0
  4. euroeval/benchmark_modules/base.py +354 -0
  5. euroeval/benchmark_modules/fresh.py +286 -0
  6. euroeval/benchmark_modules/hf.py +1185 -0
  7. euroeval/benchmark_modules/litellm.py +905 -0
  8. euroeval/benchmark_modules/vllm.py +1171 -0
  9. euroeval/benchmarker.py +1074 -0
  10. euroeval/callbacks.py +72 -0
  11. euroeval/cli.py +281 -0
  12. euroeval/constants.py +50 -0
  13. euroeval/data_loading.py +96 -0
  14. euroeval/data_models.py +474 -0
  15. euroeval/dataset_configs.py +2001 -0
  16. euroeval/enums.py +144 -0
  17. euroeval/exceptions.py +191 -0
  18. euroeval/finetuning.py +324 -0
  19. euroeval/generation.py +296 -0
  20. euroeval/human_evaluation.py +737 -0
  21. euroeval/languages.py +200 -0
  22. euroeval/model_cache.py +253 -0
  23. euroeval/model_config.py +77 -0
  24. euroeval/model_loading.py +78 -0
  25. euroeval/scores.py +90 -0
  26. euroeval/speed_benchmark.py +124 -0
  27. euroeval/task_utils/__init__.py +1 -0
  28. euroeval/task_utils/multiple_choice_classification.py +176 -0
  29. euroeval/task_utils/question_answering.py +698 -0
  30. euroeval/task_utils/sequence_classification.py +237 -0
  31. euroeval/task_utils/text_to_text.py +150 -0
  32. euroeval/task_utils/token_classification.py +464 -0
  33. euroeval/tasks.py +202 -0
  34. euroeval/types.py +97 -0
  35. euroeval/utils.py +574 -0
  36. euroeval-15.2.0.dist-info/METADATA +234 -0
  37. euroeval-15.2.0.dist-info/RECORD +40 -0
  38. euroeval-15.2.0.dist-info/WHEEL +4 -0
  39. euroeval-15.2.0.dist-info/entry_points.txt +4 -0
  40. euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0
euroeval/enums.py ADDED
@@ -0,0 +1,144 @@
+"""Enums used in the project."""
+
+from enum import Enum, auto
+
+
+class AutoStrEnum(str, Enum):
+    """StrEnum where auto() returns the field name in lower case."""
+
+    @staticmethod
+    def _generate_next_value_(
+        name: str, start: int, count: int, last_values: list
+    ) -> str:
+        return name.lower()
+
+
+class Device(AutoStrEnum):
+    """The compute device to use for the evaluation.
+
+    Attributes:
+        CPU:
+            CPU device.
+        MPS:
+            MPS GPU, used in M-series MacBooks.
+        CUDA:
+            CUDA GPU, used with NVIDIA GPUs.
+    """
+
+    CPU = auto()
+    MPS = auto()
+    CUDA = auto()
+
+
+class InferenceBackend(AutoStrEnum):
+    """The backend used for model inference.
+
+    Attributes:
+        TRANSFORMERS:
+            Hugging Face `transformers` library.
+        VLLM:
+            VLLM library.
+        LITELLM:
+            LiteLLM library.
+        NONE:
+            No inference backend used (e.g., for human evaluation).
+    """
+
+    TRANSFORMERS = auto()
+    VLLM = auto()
+    LITELLM = auto()
+    NONE = auto()
+
+
+class ModelType(AutoStrEnum):
+    """The type of a model.
+
+    Attributes:
+        ENCODER:
+            An encoder (i.e., BERT-style) model.
+        GENERATIVE:
+            A generative model. Can be either decoder or encoder-decoder (aka seq2seq).
+        HUMAN:
+            Human evaluator.
+    """
+
+    ENCODER = auto()
+    GENERATIVE = auto()
+    HUMAN = auto()
+
+
+class GenerativeType(AutoStrEnum):
+    """The type of a generative model.
+
+    Attributes:
+        BASE:
+            A base (i.e., pretrained) generative model.
+        INSTRUCTION_TUNED:
+            An instruction-tuned generative model.
+        REASONING:
+            A generative reasoning model.
+    """
+
+    BASE = auto()
+    INSTRUCTION_TUNED = auto()
+    REASONING = auto()
+
+
+class DataType(AutoStrEnum):
+    """The data type of the model weights.
+
+    Attributes:
+        FP32:
+            32-bit floating point.
+        FP16:
+            16-bit floating point.
+        BF16:
+            16-bit bfloat.
+    """
+
+    FP32 = auto()
+    FP16 = auto()
+    BF16 = auto()
+
+
+class BatchingPreference(AutoStrEnum):
+    """The preference for batching.
+
+    Attributes:
+        NO_PREFERENCE:
+            No preference for batching.
+        SINGLE_SAMPLE:
+            Single sample batching.
+        ALL_AT_ONCE:
+            All samples at once batching.
+    """
+
+    NO_PREFERENCE = auto()
+    SINGLE_SAMPLE = auto()
+    ALL_AT_ONCE = auto()
+
+
+class TaskGroup(AutoStrEnum):
+    """The overall task group of a task.
+
+    Attributes:
+        SEQUENCE_CLASSIFICATION:
+            Classification of documents.
+        MULTIPLE_CHOICE_CLASSIFICATION:
+            Classification of documents with multiple-choice options.
+        TOKEN_CLASSIFICATION:
+            Token-level classification.
+        QUESTION_ANSWERING:
+            Extractive question answering.
+        TEXT_TO_TEXT:
+            Text-to-text generation.
+        SPEED:
+            Speed benchmark.
+    """
+
+    SEQUENCE_CLASSIFICATION = auto()
+    MULTIPLE_CHOICE_CLASSIFICATION = auto()
+    TOKEN_CLASSIFICATION = auto()
+    QUESTION_ANSWERING = auto()
+    TEXT_TO_TEXT = auto()
+    SPEED = auto()
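The `AutoStrEnum` base class overrides `_generate_next_value_`, so every `auto()` member takes its lower-cased field name as its string value, and because the class also mixes in `str`, members compare equal to plain strings. A minimal sketch of that behaviour (illustrative only, not part of the package):

    from euroeval.enums import Device, TaskGroup

    # auto() resolves to the lower-cased member name
    assert Device.CUDA.value == "cuda"
    assert TaskGroup.QUESTION_ANSWERING.value == "question_answering"

    # The str mixin lets members stand in for plain strings
    assert Device.MPS == "mps"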
euroeval/exceptions.py ADDED
@@ -0,0 +1,191 @@
+"""Exceptions to be used by other functions."""
+
+
+class InvalidBenchmark(Exception):
+    """The (model, dataset) combination cannot be benchmarked."""
+
+    def __init__(
+        self, message: str = "This model cannot be benchmarked on the given dataset."
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message:
+                The message to display.
+        """
+        self.message = message
+        super().__init__(self.message)
+
+
+class InvalidModel(Exception):
+    """The model cannot be benchmarked on any datasets."""
+
+    def __init__(
+        self, message: str = "The model cannot be benchmarked on any datasets."
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message:
+                The message to display.
+        """
+        self.message = message
+        super().__init__(self.message)
+
+
+class HuggingFaceHubDown(Exception):
+    """The Hugging Face Hub seems to be down."""
+
+    def __init__(
+        self, message: str = "The Hugging Face Hub is currently down."
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message:
+                The message to display.
+        """
+        self.message = message
+        super().__init__(self.message)
+
+
+class NoInternetConnection(Exception):
+    """There seems to be no internet connection."""
+
+    def __init__(
+        self, message: str = "There is currently no internet connection."
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message:
+                The message to display.
+        """
+        self.message = message
+        super().__init__(self.message)
+
+
+class NaNValueInModelOutput(Exception):
+    """There is a NaN value in the model output."""
+
+    def __init__(
+        self, message: str = "There is a NaN value in the model output."
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message:
+                The message to display.
+        """
+        self.message = message
+        super().__init__(self.message)
+
+
+class FlashAttentionNotInstalled(Exception):
+    """The `flash-attn` package has not been installed."""
+
+    def __init__(
+        self,
+        message: str = (
+            "The model you are trying to load requires Flash Attention. To use Flash "
+            "Attention, please install the `flash-attn` package, which can be done by "
+            "running `pip install -U wheel && FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE "
+            "pip install flash-attn --no-build-isolation`."
+        ),
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message:
+                The message to display.
+        """
+        self.message = message
+        super().__init__(self.message)
+
+
+class NeedsExtraInstalled(InvalidModel):
+    """The evaluation requires extra to be installed."""
+
+    def __init__(self, extra: str) -> None:
+        """Initialize the exception.
+
+        Args:
+            extra:
+                The extra that needs to be installed.
+        """
+        self.extra = extra
+        self.message = (
+            f"The model you are trying to load requires the `{extra}` extra to be "
+            f"installed. To install the `{extra}` extra, please run `pip install "
+            f"euroeval[{extra}]` or `pip install euroeval[all]`."
+        )
+        super().__init__(self.message)
+
+
+class NeedsManualDependency(InvalidModel):
+    """The evaluation requires a dependency to be manually installed."""
+
+    def __init__(self, package: str) -> None:
+        """Initialize the exception.
+
+        Args:
+            package:
+                The package that needs to be manually installed.
+        """
+        self.package = package
+        self.message = (
+            f"The model you are trying to load requires the `{package}` package to be "
+            f"installed - please run `pip install {package}` and try again."
+        )
+        super().__init__(self.message)
+
+
+class NeedsAdditionalArgument(InvalidModel):
+    """The evaluation requires additional arguments to the `euroeval` command."""
+
+    def __init__(
+        self, cli_argument: str, script_argument: str, run_with_cli: bool
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            cli_argument:
+                The argument that needs to be passed to the `euroeval` command.
+            script_argument:
+                The argument that needs to be passed to the `Benchmarker` class.
+            run_with_cli:
+                Whether the benchmark is being run with the CLI.
+        """
+        self.cli_argument = cli_argument
+        self.script_argument = script_argument
+        if run_with_cli:
+            self.message = (
+                f"The model you are trying to load requires the `{cli_argument}` "
+                "argument to be passed to the `euroeval` command. Please pass the "
+                "argument and try again."
+            )
+        else:
+            self.message = (
+                f"The model you are trying to load requires the `{script_argument}` "
+                "argument to be passed to the `Benchmarker` class. Please pass the "
+                "argument and try again."
+            )
+        super().__init__(self.message)
+
+
+class NeedsEnvironmentVariable(InvalidModel):
+    """The evaluation requires an environment variable to be set."""
+
+    def __init__(self, env_var: str) -> None:
+        """Initialize the exception.
+
+        Args:
+            env_var:
+                The environment variable that needs to be set.
+        """
+        self.env_var = env_var
+        self.message = (
+            f"The model you are trying to load requires the `{env_var}` environment "
+            "variable to be set. Please set the environment variable and try again."
+        )
+        super().__init__(self.message)
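The four `Needs*` exceptions all subclass `InvalidModel`, so callers only need to distinguish two broad failure modes: `InvalidModel` (the model cannot be evaluated at all) and `InvalidBenchmark` (only this particular model/dataset pairing fails). A minimal sketch of that pattern; the `run_evaluation` stub below is hypothetical and the extra name is purely illustrative:

    from euroeval.exceptions import InvalidBenchmark, InvalidModel, NeedsExtraInstalled

    def run_evaluation(model_id: str, dataset: str) -> None:
        # Hypothetical stand-in for a real benchmarking call; here it just
        # simulates a model that needs an optional extra installed.
        raise NeedsExtraInstalled(extra="generative")

    try:
        run_evaluation("some-model", "some-dataset")
    except InvalidModel as e:
        # NeedsExtraInstalled, NeedsManualDependency, NeedsAdditionalArgument and
        # NeedsEnvironmentVariable all land here, since they subclass InvalidModel.
        print(e.message)
    except InvalidBenchmark as e:
        print(e.message)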
euroeval/finetuning.py ADDED
@@ -0,0 +1,324 @@
+"""Functions related to the finetuning of models."""
+
+import logging
+import sys
+import typing as t
+
+import torch
+from datasets import DatasetDict
+from tqdm.auto import tqdm
+from transformers import (
+    EarlyStoppingCallback,
+    IntervalStrategy,
+    PrinterCallback,
+    ProgressCallback,
+    TrainingArguments,
+)
+from transformers.trainer import OptimizerNames
+
+from .benchmark_modules import BenchmarkModule
+from .callbacks import NeverLeaveProgressCallback
+from .enums import DataType
+from .exceptions import InvalidBenchmark, NaNValueInModelOutput
+from .model_loading import load_model
+from .utils import (
+    block_terminal_output,
+    clear_memory,
+    enforce_reproducibility,
+    log_once,
+)
+
+if t.TYPE_CHECKING:
+    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
+
+logger = logging.getLogger("euroeval")
+
+
+def finetune(
+    model: BenchmarkModule,
+    datasets: list[DatasetDict],
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+) -> list[dict[str, float]]:
+    """Evaluate a model on a dataset through finetuning.
+
+    Args:
+        model:
+            The model to evaluate.
+        datasets:
+            The datasets to use for training and evaluation.
+        model_config:
+            The configuration of the model.
+        dataset_config:
+            The dataset configuration.
+        benchmark_config:
+            The benchmark configuration.
+
+    Returns:
+        A list of dicts containing the scores for each metric for each iteration.
+    """
+    # Set the data type to use for the model weights
+    using_cuda = benchmark_config.device == torch.device("cuda")
+    if using_cuda and torch.cuda.is_bf16_supported():
+        dtype = DataType.BF16
+    elif using_cuda:
+        dtype = DataType.FP16
+    else:
+        dtype = DataType.FP32
+
+    # TEMP
+    dtype = DataType.FP32
+
+    bs: int = benchmark_config.batch_size
+    scores: list[dict[str, float]] = list()
+    for idx in tqdm(
+        iterable=range(benchmark_config.num_iterations),
+        desc="Benchmarking",
+        disable=not benchmark_config.progress_bar,
+    ):
+        # Set variable that tracks whether we need to initialize new models in
+        # the single iteration call
+        model_already_initialized = idx == 0
+
+        # Run a loop here to deal with automatic reduction of batch size
+        while True:
+            # Clear GPU memory
+            if not model_already_initialized:
+                try:
+                    del model
+                except UnboundLocalError:
+                    pass
+                clear_memory()
+
+            try:
+                # Re-block terminal output, as it gets unblocked by the `transformers`
+                # package before training
+                block_terminal_output()
+
+                training_args = get_training_args(
+                    benchmark_config=benchmark_config,
+                    model_config=model_config,
+                    iteration_idx=idx,
+                    dtype=dtype,
+                    batch_size=bs,
+                )
+
+                itr_scores = finetune_single_iteration(
+                    model=model if model_already_initialized else None,
+                    dataset=datasets[idx],
+                    iteration_idx=idx,
+                    training_args=training_args,
+                    model_config=model_config,
+                    dataset_config=dataset_config,
+                    benchmark_config=benchmark_config,
+                )
+
+                scores.append(itr_scores)
+                logger.debug(f"Test scores for iteration {idx}: {itr_scores}")
+
+                break
+
+            # NaN values can appear in the model output when using mixed precision, as
+            # the hidden states get overflowed. In this case we try to disable mixed
+            # precision and try again.
+            except NaNValueInModelOutput:
+                if dtype != DataType.FP32:
+                    dtype = DataType.FP32
+                    model_already_initialized = False
+                    logger.debug(
+                        "NaN value detected in model outputs while using mixed "
+                        "precision. Retrying with full fp32 precision."
+                    )
+                else:
+                    raise InvalidBenchmark(
+                        "NaN value detected in model outputs, even with mixed "
+                        "precision disabled."
+                    )
+
+            except Exception as e:
+                if "CUDA" not in str(e) and "out of memory" not in str(e):
+                    raise InvalidBenchmark(str(e))
+
+                if bs <= 1:
+                    msg = "Could not benchmark the model, even with a batch size of 1!"
+                    if "MPS" in str(e):
+                        msg += (
+                            " As you are using MPS, you can try running the evaluation "
+                            "with the `PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0` "
+                            "environment variable set, as this removes the upper bound "
+                            "on the memory usage."
+                        )
+                    raise InvalidBenchmark(msg)
+
+                model_already_initialized = False
+
+                bs //= 2
+                logger.debug(f"Reduced batch size to {bs}")
+
+    return scores
+
+
+def finetune_single_iteration(
+    model: BenchmarkModule | None,
+    dataset: DatasetDict,
+    iteration_idx: int,
+    training_args: TrainingArguments,
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+) -> dict[str, float]:
+    """Run a single iteration of a benchmark.
+
+    Args:
+        model:
+            The model to use in the benchmark. If None then a new model will be loaded.
+        dataset:
+            The dataset to use for training and evaluation.
+        iteration_idx:
+            The index of the iteration.
+        training_args:
+            The training arguments.
+        model_config:
+            The model configuration.
+        dataset_config:
+            The dataset configuration.
+        benchmark_config:
+            The benchmark configuration.
+
+    Returns:
+        The scores for the test dataset.
+    """
+    # Set random seeds to enforce reproducibility of the randomly initialised weights
+    enforce_reproducibility(seed=training_args.seed)
+
+    if model is None:
+        model = load_model(
+            model_config=model_config,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
+        )
+
+    trainer = model.trainer_class(
+        model=model.get_pytorch_module(),
+        processing_class=model.get_tokenizer(),
+        args=training_args,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["val"],
+        compute_metrics=model.compute_metrics,
+        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
+        data_collator=model.data_collator,
+    )
+
+    if not benchmark_config.verbose:
+
+        def no_logging(logs: dict[str, float]) -> None:
+            return
+
+        trainer.log = no_logging
+
+    # Re-block terminal output, as it gets unblocked by the `transformers`
+    # package before training
+    block_terminal_output()
+
+    # Sort out callbacks. We remove the callbacks that are producing unnecessary
+    # output, to avoid cluttering the terminal output
+    if not benchmark_config.verbose:
+        trainer.remove_callback(PrinterCallback)
+        trainer.remove_callback(ProgressCallback)
+    if benchmark_config.progress_bar:
+        trainer.add_callback(NeverLeaveProgressCallback)
+
+    try:
+        trainer.train()
+        with torch.inference_mode():
+            try:
+                test_scores = trainer.evaluate(
+                    eval_dataset=dataset["test"],
+                    orig_eval_dataset=dataset["original_test"],
+                    metric_key_prefix="test",
+                )
+            except TypeError:
+                test_scores = trainer.evaluate(
+                    eval_dataset=dataset["test"], metric_key_prefix="test"
+                )
+        return test_scores
+
+    except NaNValueInModelOutput as e:
+        del trainer
+        del model
+        clear_memory()
+        raise e
+
+    except (RuntimeError, ValueError, IndexError) as e:
+        raise InvalidBenchmark(str(e))
+
+
+def get_training_args(
+    benchmark_config: "BenchmarkConfig",
+    model_config: "ModelConfig",
+    iteration_idx: int,
+    dtype: DataType,
+    batch_size: int | None = None,
+) -> TrainingArguments:
+    """Get the training arguments for the current iteration.
+
+    Args:
+        benchmark_config:
+            The benchmark configuration.
+        model_config:
+            The model configuration.
+        iteration_idx:
+            The index of the current iteration. This is only used to generate a
+            unique random seed for the current iteration.
+        dtype:
+            The data type to use for the model weights.
+        batch_size:
+            The batch size to use for the current iteration, or None if the batch size
+            in the benchmark config should be used.
+
+    Returns:
+        The training arguments for the current iteration.
+    """
+    log_once(message=f"Using {dtype} data type.", level=logging.DEBUG)
+
+    if benchmark_config.verbose:
+        logging_strategy = IntervalStrategy.STEPS
+    else:
+        logging_strategy = IntervalStrategy.NO
+
+    if batch_size is None:
+        batch_size = benchmark_config.batch_size
+
+    training_args = TrainingArguments(
+        output_dir=model_config.model_cache_dir,
+        evaluation_strategy=IntervalStrategy.STEPS,
+        logging_strategy=logging_strategy,
+        save_strategy=IntervalStrategy.STEPS,
+        eval_steps=30,
+        logging_steps=30,
+        save_steps=30,
+        max_steps=1 if hasattr(sys, "_called_from_test") else 10_000,
+        use_cpu=benchmark_config.device == torch.device("cpu"),
+        report_to=[],
+        save_total_limit=1,
+        per_device_train_batch_size=batch_size,
+        per_device_eval_batch_size=batch_size,
+        learning_rate=2e-5,
+        warmup_ratio=0.01,
+        gradient_accumulation_steps=32 // batch_size,
+        load_best_model_at_end=True,
+        optim=OptimizerNames.ADAMW_TORCH,
+        seed=4242 + iteration_idx,
+        fp16=dtype == DataType.FP16,
+        bf16=dtype == DataType.BF16,
+        disable_tqdm=not benchmark_config.progress_bar,
+        ddp_find_unused_parameters=False,
+        save_safetensors=False,
+    )
+
+    # TEMP: Use only 1 GPU for now for finetuning
+    if benchmark_config.device == torch.device("cuda"):
+        training_args._n_gpu = 1
+
+    return training_args
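Two details of the retry logic are easy to miss. On a CUDA or MPS out-of-memory error the per-device batch size is halved and the iteration retried, and because `get_training_args` sets `gradient_accumulation_steps=32 // batch_size`, the effective batch size stays at 32 for the power-of-two batch sizes the halving produces. A `NaNValueInModelOutput` raised under mixed precision would trigger one retry in full fp32 (although the `# TEMP` override near the top of `finetune` currently forces fp32 from the start). A minimal sketch of the batch-size arithmetic, independent of the package:

    # Halving the per-device batch size while scaling gradient accumulation keeps
    # the effective batch size constant at 32.
    batch_size = 32
    while batch_size >= 1:
        grad_accum = 32 // batch_size
        assert batch_size * grad_accum == 32
        print(f"per-device batch size {batch_size:>2} -> accumulation steps {grad_accum:>2}")
        batch_size //= 2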