euroeval-15.10.1-py3-none-any.whl → euroeval-15.12.0-py3-none-any.whl
- euroeval/__init__.py +7 -0
- euroeval/benchmark_config_factory.py +7 -0
- euroeval/benchmark_modules/base.py +29 -29
- euroeval/benchmark_modules/fresh.py +31 -19
- euroeval/benchmark_modules/hf.py +27 -23
- euroeval/benchmark_modules/litellm.py +50 -30
- euroeval/benchmark_modules/vllm.py +22 -26
- euroeval/benchmarker.py +8 -1
- euroeval/callbacks.py +17 -13
- euroeval/cli.py +10 -0
- euroeval/data_loading.py +10 -5
- euroeval/data_models.py +9 -40
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/english.py +13 -4
- euroeval/dataset_configs/norwegian.py +8 -0
- euroeval/dataset_configs/portuguese.py +74 -0
- euroeval/dataset_configs/spanish.py +4 -3
- euroeval/finetuning.py +9 -8
- euroeval/generation.py +27 -8
- euroeval/human_evaluation.py +14 -13
- euroeval/languages.py +1 -2
- euroeval/metrics.py +452 -0
- euroeval/prompt_templates/linguistic_acceptability.py +9 -1
- euroeval/prompt_templates/multiple_choice.py +9 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -1
- euroeval/prompt_templates/sentiment_classification.py +11 -1
- euroeval/prompt_templates/summarization.py +8 -1
- euroeval/scores.py +14 -19
- euroeval/speed_benchmark.py +6 -7
- euroeval/task_group_utils/multiple_choice_classification.py +6 -4
- euroeval/task_group_utils/question_answering.py +5 -28
- euroeval/task_group_utils/sequence_classification.py +6 -30
- euroeval/task_group_utils/text_to_text.py +19 -34
- euroeval/task_group_utils/token_classification.py +18 -30
- euroeval/tasks.py +11 -136
- euroeval/types.py +6 -4
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/METADATA +10 -10
- euroeval-15.12.0.dist-info/RECORD +63 -0
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/licenses/LICENSE +1 -1
- euroeval-15.10.1.dist-info/RECORD +0 -61
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/WHEEL +0 -0
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/entry_points.txt +0 -0
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -13,14 +13,11 @@ from pathlib import Path
 from time import sleep
 
 import torch
-from datasets import DatasetDict
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
 from tqdm.auto import tqdm
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError
 
 from ..constants import (
@@ -34,13 +31,7 @@ from ..constants import (
     TASKS_USING_JSON,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
-from ..data_models import (
-    BenchmarkConfig,
-    DatasetConfig,
-    GenerativeModelOutput,
-    ModelConfig,
-    Task,
-)
+from ..data_models import GenerativeModelOutput, ModelConfig
 from ..enums import (
     BatchingPreference,
     GenerativeType,
@@ -94,6 +85,13 @@ if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
     import ray
 
+if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.trainer import Trainer
+
+    from ..data_models import BenchmarkConfig, DatasetConfig, Task
+
 logger = logging.getLogger("euroeval")
 
 
@@ -106,9 +104,9 @@ class VLLMModel(HuggingFaceEncoderModel):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        dataset_config: DatasetConfig,
-        benchmark_config: BenchmarkConfig,
+        model_config: "ModelConfig",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
     ) -> None:
         """Initialise the vLLM model.
 
@@ -129,8 +127,8 @@ class VLLMModel(HuggingFaceEncoderModel):
         model, tokenizer = load_model_and_tokenizer(
             model_config=model_config, benchmark_config=benchmark_config
         )
-        self._model: LLM = model
-        self._tokenizer: PreTrainedTokenizer = tokenizer
+        self._model: "LLM" = model
+        self._tokenizer: "PreTrainedTokenizer" = tokenizer
         self.end_of_reasoning_token = get_end_of_reasoning_token(
             model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
         )
@@ -230,8 +228,8 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
 
     def prepare_dataset(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> DatasetDict:
+        self, dataset: "DatasetDict", task: "Task", itr_idx: int
+    ) -> "DatasetDict":
         """Prepare the dataset for the model.
 
         This includes things like tokenisation.
@@ -293,7 +291,7 @@ class VLLMModel(HuggingFaceEncoderModel):
 
         return dataset
 
-    def generate(self, inputs: dict) -> GenerativeModelOutput:
+    def generate(self, inputs: dict) -> "GenerativeModelOutput":
         """Generate outputs from the model.
 
         Args:
@@ -524,7 +522,7 @@ class VLLMModel(HuggingFaceEncoderModel):
 
     @classmethod
     def model_exists(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
     ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
         """Check if a model exists.
 
@@ -558,8 +556,8 @@ class VLLMModel(HuggingFaceEncoderModel):
 
     @classmethod
     def get_model_config(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
-    ) -> ModelConfig:
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
+    ) -> "ModelConfig":
         """Fetch the model configuration.
 
         Args:
@@ -628,8 +626,8 @@ class VLLMModel(HuggingFaceEncoderModel):
 
 
 def load_model_and_tokenizer(
-    model_config: ModelConfig, benchmark_config: BenchmarkConfig
-) -> tuple[LLM, PreTrainedTokenizer]:
+    model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
+) -> tuple["LLM", "PreTrainedTokenizer"]:
     """Load the model and tokenizer.
 
     Args:
@@ -759,7 +757,7 @@ def load_model_and_tokenizer(
     model = LLM(
         model=model_id,
         tokenizer=model_id,
-        gpu_memory_utilization=
+        gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
         max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
         download_dir=download_dir,
         trust_remote_code=benchmark_config.trust_remote_code,
@@ -1017,7 +1015,6 @@ def get_custom_stop_tokens(
     """
     candidate_stop_tokens = CUSTOM_STOP_TOKENS
 
-    # Create a prompt to check if the model uses the reasoning tokens
     prompt = "Hello"
    if tokenizer.chat_template is not None:
         templated_prompt = tokenizer.apply_chat_template(
@@ -1028,7 +1025,6 @@ def get_custom_stop_tokens(
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
 
-    # Check that the beginning-of-reasoning token is actually used by the model
     max_tokens = REASONING_MAX_TOKENS if is_reasoning_model else 10
     completion = (
         model.generate(
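Most of the edits in this module (and the same pattern recurs in callbacks.py, data_loading.py, data_models.py and finetuning.py below) move imports that are only needed for type hints behind `typing.TYPE_CHECKING` and quote the corresponding annotations, so the heavy dependencies are no longer imported at runtime. A minimal, self-contained sketch of the pattern, using an illustrative helper that is not part of EuroEval:

import typing as t

if t.TYPE_CHECKING:
    # Only evaluated by static type checkers; skipped at runtime, so importing
    # this module does not pull in the `datasets` package.
    from datasets import DatasetDict


def first_split(dataset: "DatasetDict") -> str:
    """Return the name of the first split (illustrative helper, not EuroEval API)."""
    # The quoted annotation is resolved lazily, so the module stays importable
    # even though `DatasetDict` is undefined here at runtime.
    return next(iter(dataset))
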
euroeval/benchmarker.py
CHANGED

@@ -78,6 +78,7 @@ class Benchmarker:
         num_iterations: int = 10,
         api_base: str | None = None,
         api_version: str | None = None,
+        gpu_memory_utilization: float = 0.9,
         debug: bool = False,
         run_with_cli: bool = False,
         only_allow_safetensors: bool = False,
@@ -145,6 +146,11 @@ class Benchmarker:
                 to a model on an inference API. Defaults to None.
             api_version:
                 The version of the API to use. Defaults to None.
+            gpu_memory_utilization:
+                The GPU memory utilization to use for vLLM. Only relevant if the model
+                is generative. A larger value will result in faster evaluation, but at
+                the risk of running out of GPU memory. Only reduce this if you are
+                running out of GPU memory. Defaults to 0.9.
             debug:
                 Whether to output debug information. Defaults to False.
             run_with_cli:
@@ -192,6 +198,7 @@ class Benchmarker:
             num_iterations=num_iterations,
             api_base=api_base,
             api_version=api_version,
+            gpu_memory_utilization=gpu_memory_utilization,
             debug=debug,
             run_with_cli=run_with_cli,
             only_allow_safetensors=only_allow_safetensors,
@@ -767,7 +774,7 @@ class Benchmarker:
 
         results = log_scores(
             dataset_name=dataset_config.pretty_name,
-
+            metrics=dataset_config.task.metrics,
             scores=scores,
             model_id=model_config.model_id,
             model_revision=model_config.revision,
euroeval/callbacks.py
CHANGED

@@ -1,12 +1,16 @@
 """Callbacks for the Hugging Face Trainer."""
 
 import sys
+import typing as t
 from collections.abc import Sized
 
-from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-from transformers.trainer_callback import ProgressCallback
-
+from transformers.trainer_callback import ProgressCallback
+
+if t.TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+    from transformers.trainer_callback import TrainerControl, TrainerState
+    from transformers.training_args import TrainingArguments
 
 
 class NeverLeaveProgressCallback(ProgressCallback):
@@ -20,9 +24,9 @@ class NeverLeaveProgressCallback(ProgressCallback):
 
     def on_train_begin(
         self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
+        args: "TrainingArguments",
+        state: "TrainerState",
+        control: "TrainerControl",
         **kwargs: str,
     ) -> None:
         """Callback actions when training begins."""
@@ -38,9 +42,9 @@ class NeverLeaveProgressCallback(ProgressCallback):
 
     def on_step_end(
         self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
+        args: "TrainingArguments",
+        state: "TrainerState",
+        control: "TrainerControl",
         **kwargs: str,
     ) -> None:
         """Callback actions when a training step ends."""
@@ -50,10 +54,10 @@ class NeverLeaveProgressCallback(ProgressCallback):
 
     def on_prediction_step(
         self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        eval_dataloader: DataLoader | None = None,
+        args: "TrainingArguments",
+        state: "TrainerState",
+        control: "TrainerControl",
+        eval_dataloader: "DataLoader | None" = None,
         **kwargs: str,
     ) -> None:
         """Callback actions when a prediction step ends."""
euroeval/cli.py
CHANGED

@@ -186,6 +186,14 @@ from .tasks import get_all_tasks
     help="The version of the API to use. Only relevant if `model` refers to a model on "
     "an inference API.",
 )
+@click.option(
+    "--gpu-memory-utilization",
+    default=0.9,
+    show_default=True,
+    help="The GPU memory utilization to use for vLLM. A larger value will result in "
+    "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
+    "if you are running out of GPU memory. Only relevant if the model is generative.",
+)
 @click.option(
     "--debug/--no-debug",
     default=False,
@@ -223,6 +231,7 @@ def benchmark(
     num_iterations: int,
     api_base: str | None,
     api_version: str | None,
+    gpu_memory_utilization: float,
     debug: bool,
     only_allow_safetensors: bool,
 ) -> None:
@@ -258,6 +267,7 @@ def benchmark(
         num_iterations=num_iterations,
         api_base=api_base,
         api_version=api_version,
+        gpu_memory_utilization=gpu_memory_utilization,
         debug=debug,
         run_with_cli=True,
         only_allow_safetensors=only_allow_safetensors,
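Taken together with the benchmarker.py change above, the new option is exposed both through the Python API and through the `--gpu-memory-utilization` CLI flag added here. A hedged usage sketch (the model ID is a placeholder, and the `benchmark` entry point is assumed from EuroEval's usual API rather than shown in this diff):

from euroeval import Benchmarker

# Lower vLLM's GPU memory utilisation from the 0.9 default only when the
# default configuration runs out of GPU memory.
benchmarker = Benchmarker(gpu_memory_utilization=0.8)
benchmarker.benchmark(model="<generative-model-id>")
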
euroeval/data_loading.py
CHANGED

@@ -3,23 +3,28 @@
 import logging
 import sys
 import time
+import typing as t
 
 import requests
-from datasets import Dataset, DatasetDict, load_dataset
+from datasets import DatasetDict, load_dataset
 from datasets.exceptions import DatasetsError
 from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator
 
-from .data_models import BenchmarkConfig, DatasetConfig
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
 from .utils import unscramble
 
+if t.TYPE_CHECKING:
+    from datasets import Dataset
+
+    from .data_models import BenchmarkConfig, DatasetConfig
+
 logger = logging.getLogger("euroeval")
 
 
 def load_data(
     rng: Generator, dataset_config: "DatasetConfig", benchmark_config: "BenchmarkConfig"
-) -> list[DatasetDict]:
+) -> list["DatasetDict"]:
     """Load the raw bootstrapped datasets.
 
     Args:
@@ -56,7 +61,7 @@ def load_data(
         dataset["test"] = dataset["test"].select(range(1))
 
     # Bootstrap the splits
-    bootstrapped_splits: dict[str, list[Dataset]] = dict()
+    bootstrapped_splits: dict[str, list["Dataset"]] = dict()
     for split in ["train", "val", "test"]:
         bootstrap_indices = rng.integers(
             0,
@@ -80,7 +85,7 @@ def load_data(
     return datasets
 
 
-def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> DatasetDict:
+def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
     """Load the raw dataset.
 
     Args:
euroeval/data_models.py
CHANGED

@@ -1,6 +1,5 @@
 """Data models used in EuroEval."""
 
-import collections.abc as c
 import json
 import pathlib
 import re
@@ -11,48 +10,11 @@ import pydantic
 import torch
 
 from .enums import Device, InferenceBackend, ModelType, TaskGroup
+from .metrics import Metric
 from .types import ScoreDict
 from .utils import get_package_version
 
 
-@dataclass
-class MetricConfig:
-    """Configuration for a metric.
-
-    Attributes:
-        name:
-            The name of the metric.
-        pretty_name:
-            A longer prettier name for the metric, which allows cases and spaces. Used
-            for logging.
-        huggingface_id:
-            The Hugging Face ID of the metric.
-        results_key:
-            The name of the key used to extract the metric scores from the results
-            dictionary.
-        compute_kwargs:
-            Keyword arguments to pass to the metric's compute function. Defaults to
-            an empty dictionary.
-        postprocessing_fn:
-            A function to apply to the metric scores after they are computed, taking
-            the score to the postprocessed score along with its string representation.
-            Defaults to x -> (100 * x, f"{x:.2%}").
-    """
-
-    name: str
-    pretty_name: str
-    huggingface_id: str
-    results_key: str
-    compute_kwargs: dict[str, t.Any] = field(default_factory=dict)
-    postprocessing_fn: c.Callable[[float], tuple[float, str]] = field(
-        default_factory=lambda: lambda raw_score: (100 * raw_score, f"{raw_score:.2%}")
-    )
-
-    def __hash__(self) -> int:
-        """Return a hash of the metric configuration."""
-        return hash(self.name)
-
-
 @dataclass
 class Language:
     """A benchmarkable language.
@@ -147,7 +109,7 @@ class Task:
     name: str
     task_group: TaskGroup
     template_dict: dict["Language", "PromptConfig"]
-    metrics: list[MetricConfig]
+    metrics: list[Metric]
     default_num_few_shot_examples: int
     default_max_generated_tokens: int
     default_labels: list[str]
@@ -206,6 +168,11 @@ class BenchmarkConfig:
        api_version:
            The version of the API to use. Only relevant if `model` refers to a model on
            an inference API.
+        gpu_memory_utilization:
+            The GPU memory utilization to use for vLLM. A larger value will result in
+            faster evaluation, but at the risk of running out of GPU memory. Only reduce
+            this if you are running out of GPU memory. Only relevant if the model is
+            generative.
         debug:
             Whether to run the benchmark in debug mode.
         run_with_cli:
@@ -234,6 +201,7 @@ class BenchmarkConfig:
     num_iterations: int
     api_base: str | None
     api_version: str | None
+    gpu_memory_utilization: float
     debug: bool
     run_with_cli: bool
     only_allow_safetensors: bool
@@ -265,6 +233,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     num_iterations: int
     api_base: str | None
     api_version: str | None
+    gpu_memory_utilization: float
     debug: bool
     run_with_cli: bool
     only_allow_safetensors: bool
euroeval/dataset_configs/__init__.py
CHANGED

@@ -13,6 +13,7 @@ from .german import *  # noqa: F403
 from .icelandic import *  # noqa: F403
 from .italian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
+from .portuguese import *  # noqa: F403
 from .spanish import *  # noqa: F403
 from .swedish import *  # noqa: F403
 
euroeval/dataset_configs/english.py
CHANGED

@@ -49,10 +49,10 @@ CNN_DAILYMAIL_CONFIG = DatasetConfig(
     languages=[EN],
 )
 
-
-    name="
-    pretty_name="the
-    huggingface_id="EuroEval/
+LIFE_IN_THE_UK_CONFIG = DatasetConfig(
+    name="life-in-the-uk",
+    pretty_name="the English knowledge dataset Life in the UK",
+    huggingface_id="EuroEval/life-in-the-uk",
     task=KNOW,
     languages=[EN],
 )
@@ -86,3 +86,12 @@ BELEBELE_CONFIG = DatasetConfig(
     languages=[EN],
     unofficial=True,
 )
+
+MMLU_CONFIG = DatasetConfig(
+    name="mmlu",
+    pretty_name="the truncated version of the English knowledge dataset MMLU",
+    huggingface_id="EuroEval/mmlu-mini",
+    task=KNOW,
+    languages=[EN],
+    unofficial=True,
+)
euroeval/dataset_configs/norwegian.py
CHANGED

@@ -76,6 +76,14 @@ NRK_QUIZ_QA_CONFIG = DatasetConfig(
     languages=[NB, NN, NO],
 )
 
+IDIOMS_NO_CONFIG = DatasetConfig(
+    name="idioms-no",
+    pretty_name="the Norwegian knowledge dataset Idioms-no",
+    huggingface_id="EuroEval/idioms-no",
+    task=KNOW,
+    languages=[NB, NN, NO],
+)
+
 NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
     name="nor-common-sense-qa",
     pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
euroeval/dataset_configs/portuguese.py
ADDED

@@ -0,0 +1,74 @@
+"""All Portuguese dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import PT
+from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, SENT, SUMM
+
+### Official datasets ###
+
+SST2_PT_CONFIG = DatasetConfig(
+    name="sst2-pt",
+    pretty_name="the truncated version of the Portuguese sentiment classification "
+    "dataset SST2-pt, translated from the English SST2 dataset",
+    huggingface_id="EuroEval/sst2-pt-mini",
+    task=SENT,
+    languages=[PT],
+    _labels=["positive", "negative"],
+)
+
+
+MMLU_PT_CONFIG = DatasetConfig(
+    name="mmlu-pt",
+    pretty_name="the truncated version of the Portuguese knowledge dataset MMLU-pt, "
+    "translated from the English MMLU dataset",
+    huggingface_id="EuroEval/mmlu-pt-mini",
+    task=KNOW,
+    languages=[PT],
+)
+
+
+GOLDENSWAG_PT_CONFIG = DatasetConfig(
+    name="goldenswag-pt",
+    pretty_name="the truncated version of the Portuguese common-sense reasoning "
+    "dataset GoldenSwag-pt, translated from the English GoldenSwag dataset",
+    huggingface_id="EuroEval/goldenswag-pt-mini",
+    task=COMMON_SENSE,
+    languages=[PT],
+)
+
+
+SCALA_PT = DatasetConfig(
+    name="scala-pt",
+    pretty_name="the Portuguese part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-pt",
+    task=LA,
+    languages=[PT],
+)
+
+HAREM_CONFIG = DatasetConfig(
+    name="harem",
+    pretty_name="the Portuguese named entity recognition dataset HAREM",
+    huggingface_id="EuroEval/harem",
+    task=NER,
+    languages=[PT],
+)
+
+PUBLICO_CONFIG = DatasetConfig(
+    name="publico",
+    pretty_name="the truncated version of the Portuguese summarisation dataset Público",
+    huggingface_id="EuroEval/publico-mini",
+    task=SUMM,
+    languages=[PT],
+)
+
+
+### Unofficial datasets ###
+
+BOOLQ_PT_CONFIG = DatasetConfig(
+    name="boolq-pt",
+    pretty_name="the Portuguese multiple choice reading comprehension dataset "
+    "BoolQ-pt, translated from the English BoolQ dataset",
+    huggingface_id="EuroEval/boolq-pt",
+    task=MCRC,
+    languages=[PT],
+)
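With these configurations registered in dataset_configs/__init__.py above, the new Portuguese datasets should be selectable by the names defined here. A hedged sketch (placeholder model ID; the `dataset` keyword and list form are assumed from EuroEval's usual API, not shown in this diff):

from euroeval import Benchmarker

# Evaluate a model on a subset of the new Portuguese datasets; the dataset
# names come from the configurations above.
Benchmarker().benchmark(
    model="<model-id>", dataset=["sst2-pt", "scala-pt", "harem"]
)
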
euroeval/dataset_configs/spanish.py
CHANGED

@@ -8,7 +8,8 @@ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
 SENTIMENT_HEADLINES_CONFIG = DatasetConfig(
     name="sentiment-headlines-es",
-    pretty_name="the truncated version of the Spanish sentiment
+    pretty_name="the truncated version of the Spanish sentiment classification dataset "
+    "SentimentHeadlines",
     huggingface_id="EuroEval/sentiment-headlines-es",
     task=SENT,
     languages=[ES],
@@ -33,7 +34,7 @@ CONLL_ES_CONFIG = DatasetConfig(
 
 MLQA_ES_CONFIG = DatasetConfig(
     name="mlqa-es",
-    pretty_name="the Spanish version of the
+    pretty_name="the Spanish version of the reading comprehension dataset MLQA",
     huggingface_id="EuroEval/mlqa-es",
     task=RC,
     languages=[ES],
@@ -70,7 +71,7 @@ HELLASWAG_ES_CONFIG = DatasetConfig(
 
 XQUAD_ES_CONFIG = DatasetConfig(
     name="xquad-es",
-    pretty_name="the Spanish version of the
+    pretty_name="the Spanish version of the reading comprehension dataset XQuAD",
     huggingface_id="EuroEval/xquad-es",
     task=RC,
     languages=[ES],
euroeval/finetuning.py
CHANGED

@@ -5,7 +5,6 @@ import sys
 import typing as t
 
 import torch
-from datasets import DatasetDict
 from tqdm.auto import tqdm
 from transformers.trainer_callback import (
     EarlyStoppingCallback,
@@ -15,7 +14,6 @@ from transformers.trainer_callback import (
 from transformers.trainer_utils import IntervalStrategy
 from transformers.training_args import OptimizerNames, TrainingArguments
 
-from .benchmark_modules import BenchmarkModule
 from .callbacks import NeverLeaveProgressCallback
 from .enums import DataType
 from .exceptions import InvalidBenchmark, NaNValueInModelOutput
@@ -28,14 +26,17 @@ from .utils import (
 )
 
 if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+
+    from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
 logger = logging.getLogger("euroeval")
 
 
 def finetune(
-    model: BenchmarkModule,
-    datasets: list[DatasetDict],
+    model: "BenchmarkModule",
+    datasets: list["DatasetDict"],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -155,9 +156,9 @@ def finetune(
 
 
 def finetune_single_iteration(
-    model: BenchmarkModule | None,
-    dataset: DatasetDict,
-    training_args: TrainingArguments,
+    model: "BenchmarkModule | None",
+    dataset: "DatasetDict",
+    training_args: "TrainingArguments",
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -254,7 +255,7 @@ def get_training_args(
     iteration_idx: int,
     dtype: DataType,
     batch_size: int | None = None,
-) -> TrainingArguments:
+) -> "TrainingArguments":
     """Get the training arguments for the current iteration.
 
     Args: