EuroEval 15.10.1__py3-none-any.whl → 15.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +7 -0
- euroeval/benchmark_config_factory.py +7 -0
- euroeval/benchmark_modules/base.py +29 -29
- euroeval/benchmark_modules/fresh.py +31 -19
- euroeval/benchmark_modules/hf.py +27 -23
- euroeval/benchmark_modules/litellm.py +50 -30
- euroeval/benchmark_modules/vllm.py +22 -26
- euroeval/benchmarker.py +8 -1
- euroeval/callbacks.py +17 -13
- euroeval/cli.py +10 -0
- euroeval/data_loading.py +10 -5
- euroeval/data_models.py +9 -40
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/english.py +13 -4
- euroeval/dataset_configs/norwegian.py +8 -0
- euroeval/dataset_configs/portuguese.py +74 -0
- euroeval/dataset_configs/spanish.py +4 -3
- euroeval/finetuning.py +9 -8
- euroeval/generation.py +27 -8
- euroeval/human_evaluation.py +14 -13
- euroeval/languages.py +1 -2
- euroeval/metrics.py +452 -0
- euroeval/prompt_templates/linguistic_acceptability.py +9 -1
- euroeval/prompt_templates/multiple_choice.py +9 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -1
- euroeval/prompt_templates/sentiment_classification.py +11 -1
- euroeval/prompt_templates/summarization.py +8 -1
- euroeval/scores.py +14 -19
- euroeval/speed_benchmark.py +6 -7
- euroeval/task_group_utils/multiple_choice_classification.py +6 -4
- euroeval/task_group_utils/question_answering.py +5 -28
- euroeval/task_group_utils/sequence_classification.py +6 -30
- euroeval/task_group_utils/text_to_text.py +19 -34
- euroeval/task_group_utils/token_classification.py +18 -30
- euroeval/tasks.py +11 -136
- euroeval/types.py +6 -4
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/METADATA +10 -10
- euroeval-15.12.0.dist-info/RECORD +63 -0
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/licenses/LICENSE +1 -1
- euroeval-15.10.1.dist-info/RECORD +0 -61
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/WHEEL +0 -0
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/entry_points.txt +0 -0
euroeval/__init__.py
CHANGED
@@ -86,6 +86,13 @@ os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
 
+# Avoid the "Unclosed client session" error when evaluating Ollama models with LiteLLM.
+# The error comes from the `aiohttp` package, and this environment variable forces the
+# use of `httpx` instead.
+# Link: https://github.com/BerriAI/litellm/issues/11657#issuecomment-3038984975
+os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
+
+
 # Use older version v0 of vLLM, as the newer one requires XGrammar as decoding backend,
 # but XGrammar does not support having a maximal amount of elements in lists
 os.environ["VLLM_USE_V1"] = "0"
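A minimal standalone sketch of the same workaround, assuming (as its placement at package import time suggests) that the variable must be set before LiteLLM creates its first client session; only the ordering is being illustrated:

    import os

    # Force LiteLLM to route requests through httpx instead of aiohttp; set this
    # before anything imports or calls LiteLLM.
    os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"

    import litellm  # noqa: E402  (imported only after the environment is configured)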
euroeval/benchmark_config_factory.py
CHANGED

@@ -42,6 +42,7 @@ def build_benchmark_config(
     num_iterations: int,
     api_base: str | None,
     api_version: str | None,
+    gpu_memory_utilization: float,
     debug: bool,
     run_with_cli: bool,
     only_allow_safetensors: bool,
@@ -102,6 +103,11 @@ def build_benchmark_config(
             model on an inference API.
         api_version:
             The version of the API to use for a given inference API.
+        gpu_memory_utilization:
+            The GPU memory utilization to use for vLLM. A larger value will result in
+            faster evaluation, but at the risk of running out of GPU memory. Only reduce
+            this if you are running out of GPU memory. Only relevant if the model is
+            generative.
         debug:
             Whether to run the benchmark in debug mode.
         run_with_cli:
@@ -154,6 +160,7 @@ def build_benchmark_config(
         num_iterations=num_iterations,
         api_base=api_base,
         api_version=api_version,
+        gpu_memory_utilization=gpu_memory_utilization,
         debug=debug,
         run_with_cli=run_with_cli,
         only_allow_safetensors=only_allow_safetensors,
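The new option is documented as a vLLM setting, so it presumably maps onto vLLM's parameter of the same name. A minimal sketch of that underlying knob, assuming vLLM is installed; the model id and the 0.8 value are arbitrary illustrations:

    from vllm import LLM

    # Fraction of each GPU's memory that vLLM may reserve for model weights and the
    # KV cache: higher values give faster evaluation, lower values avoid OOM errors.
    llm = LLM(model="mistralai/Mistral-7B-v0.1", gpu_memory_utilization=0.8)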
euroeval/benchmark_modules/base.py
CHANGED

@@ -10,17 +10,8 @@ from functools import cached_property, partial
 from datasets import DatasetDict
 from torch import nn
 from tqdm.auto import tqdm
-
-from
-
-from ..data_models import (
-    BenchmarkConfig,
-    DatasetConfig,
-    GenerativeModelOutput,
-    ModelConfig,
-    Task,
-)
-from ..enums import BatchingPreference, GenerativeType, TaskGroup
+
+from ..enums import TaskGroup
 from ..exceptions import NeedsEnvironmentVariable, NeedsExtraInstalled
 from ..task_group_utils import (
     question_answering,
@@ -28,9 +19,22 @@ from ..task_group_utils import (
     text_to_text,
     token_classification,
 )
-from ..types import ComputeMetricsFunction, ExtractLabelsFunction
 from ..utils import log_once
 
+if t.TYPE_CHECKING:
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.trainer import Trainer
+
+    from ..data_models import (
+        BenchmarkConfig,
+        DatasetConfig,
+        GenerativeModelOutput,
+        ModelConfig,
+        Task,
+    )
+    from ..enums import BatchingPreference, GenerativeType
+    from ..types import ComputeMetricsFunction, ExtractLabelsFunction
+
 logger = logging.getLogger("euroeval")
 
 
@@ -49,14 +53,14 @@ class BenchmarkModule(ABC):
     """
 
     fresh_model: bool
-    batching_preference: BatchingPreference
+    batching_preference: "BatchingPreference"
     high_priority: bool
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        dataset_config: DatasetConfig,
-        benchmark_config: BenchmarkConfig,
+        model_config: "ModelConfig",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
     ) -> None:
         """Initialise the benchmark module.
 
@@ -138,7 +142,7 @@ class BenchmarkModule(ABC):
 
     @property
     @abstractmethod
-    def generative_type(self) -> GenerativeType | None:
+    def generative_type(self) -> "GenerativeType | None":
         """Get the generative type of the model.
 
         Returns:
@@ -177,7 +181,7 @@ class BenchmarkModule(ABC):
         ...
 
     @property
-    def compute_metrics(self) -> ComputeMetricsFunction:
+    def compute_metrics(self) -> "ComputeMetricsFunction":
         """The function used to compute the metrics.
 
         Returns:
@@ -188,13 +192,11 @@ class BenchmarkModule(ABC):
                 return partial(
                     sequence_classification.compute_metrics,
                     dataset_config=self.dataset_config,
-                    benchmark_config=self.benchmark_config,
                 )
             case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
                 return partial(
                     sequence_classification.compute_metrics,
                     dataset_config=self.dataset_config,
-                    benchmark_config=self.benchmark_config,
                 )
             case TaskGroup.TEXT_TO_TEXT:
                 return partial(
@@ -207,13 +209,11 @@
                     token_classification.compute_metrics,
                     has_misc_tags=self.buffer.get("has_misc_tags", True),
                     dataset_config=self.dataset_config,
-                    benchmark_config=self.benchmark_config,
                 )
             case TaskGroup.QUESTION_ANSWERING:
                 return partial(
                     question_answering.compute_metrics,
                     dataset_config=self.dataset_config,
-                    benchmark_config=self.benchmark_config,
                 )
             case _:
                 raise NotImplementedError(
@@ -222,7 +222,7 @@
 
     @property
     @abstractmethod
-    def extract_labels_from_generation(self) -> ExtractLabelsFunction:
+    def extract_labels_from_generation(self) -> "ExtractLabelsFunction":
         """The function used to extract the labels from the generated output.
 
         Returns:
@@ -241,7 +241,7 @@
         ...
 
     def prepare_datasets(
-        self, datasets: list[DatasetDict], task: Task
+        self, datasets: list[DatasetDict], task: "Task"
     ) -> list[DatasetDict]:
         """Prepare the datasets for the model.
 
@@ -283,7 +283,7 @@
 
     @abstractmethod
     def prepare_dataset(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
+        self, dataset: DatasetDict, task: "Task", itr_idx: int
     ) -> DatasetDict:
         """Prepare the dataset for the model.
 
@@ -302,7 +302,7 @@
         """
         ...
 
-    def generate(self, inputs: dict) -> GenerativeModelOutput:
+    def generate(self, inputs: dict) -> "GenerativeModelOutput":
         """Generate outputs from the model.
 
         Args:
@@ -320,7 +320,7 @@
     @classmethod
     @abstractmethod
     def model_exists(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
    ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
         """Check if a model exists.
 
@@ -339,8 +339,8 @@
     @classmethod
     @abstractmethod
     def get_model_config(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
-    ) -> ModelConfig:
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
+    ) -> "ModelConfig":
         """Fetch the model configuration.
 
         Args:
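The change running through this file (and repeated in fresh.py, hf.py and litellm.py below) is the standard `typing.TYPE_CHECKING` idiom: imports needed only for annotations are moved into a block that type checkers evaluate but the interpreter never runs, and the affected annotations become strings. A minimal self-contained sketch of the idiom, not taken from the package:

    import typing as t

    if t.TYPE_CHECKING:
        # Evaluated by mypy/pyright only; the heavy transformers import is skipped
        # entirely at runtime.
        from transformers.modeling_utils import PreTrainedModel


    def count_parameters(model: "PreTrainedModel") -> int:
        # The quoted annotation is never evaluated at runtime, so the missing runtime
        # import is harmless; type checkers still resolve it via the block above.
        return sum(parameter.numel() for parameter in model.parameters())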
euroeval/benchmark_modules/fresh.py
CHANGED

@@ -1,11 +1,10 @@
 """Freshly initialised encoder models."""
 
 import os
+import typing as t
 from functools import cached_property
 from json import JSONDecodeError
 
-from transformers.configuration_utils import PretrainedConfig
-from transformers.modeling_utils import PreTrainedModel
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 from transformers.models.electra import (
@@ -18,9 +17,8 @@ from transformers.models.xlm_roberta import (
     XLMRobertaForSequenceClassification,
     XLMRobertaForTokenClassification,
 )
-from transformers.tokenization_utils import PreTrainedTokenizer
 
-from ..data_models import
+from ..data_models import ModelConfig
 from ..enums import InferenceBackend, ModelType, TaskGroup
 from ..exceptions import (
     InvalidBenchmark,
@@ -35,6 +33,13 @@ from .hf import (
     setup_model_for_question_answering,
 )
 
+if t.TYPE_CHECKING:
+    from transformers.configuration_utils import PretrainedConfig
+    from transformers.modeling_utils import PreTrainedModel
+    from transformers.tokenization_utils import PreTrainedTokenizer
+
+    from ..data_models import BenchmarkConfig, DatasetConfig
+
 
 class FreshEncoderModel(HuggingFaceEncoderModel):
     """A freshly initialised encoder model."""
@@ -43,9 +48,9 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        dataset_config: DatasetConfig,
-        benchmark_config: BenchmarkConfig,
+        model_config: "ModelConfig",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
     ) -> None:
         """Initialise the model.
 
@@ -67,8 +72,8 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
             benchmark_config=benchmark_config,
             model_max_length=self.model_max_length,
         )
-        self._model: PreTrainedModel = model
-        self._tokenizer: PreTrainedTokenizer = tokenizer
+        self._model: "PreTrainedModel" = model
+        self._tokenizer: "PreTrainedTokenizer" = tokenizer
 
         self._model, self._tokenizer = align_model_and_tokenizer(
             model=self._model,
@@ -141,7 +146,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
 
     @classmethod
     def model_exists(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
    ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
         """Check if a model exists.
 
@@ -160,8 +165,8 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
 
     @classmethod
     def get_model_config(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
-    ) -> ModelConfig:
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
+    ) -> "ModelConfig":
         """Fetch the model configuration.
 
         Args:
@@ -190,11 +195,11 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
 
 
 def load_model_and_tokenizer(
-    model_config: ModelConfig,
-    dataset_config: DatasetConfig,
-    benchmark_config: BenchmarkConfig,
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     model_max_length: int,
-) -> tuple[PreTrainedModel, PreTrainedTokenizer]:
+) -> "tuple[PreTrainedModel, PreTrainedTokenizer]":
     """Load the model and tokenizer.
 
     Args:
@@ -248,12 +253,19 @@ def load_model_and_tokenizer(
     )
     model_cls = model_cls_mapping[model_id]
 
+    # Special case where there is a mismatch between the labels during training and
+    # testing
+    if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+        id2label = {0: "0", 1: "1"}
+    else:
+        id2label = dataset_config.id2label
+
     config = AutoConfig.from_pretrained(
         real_model_id,
         token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
-        num_labels=
-        id2label=
-        label2id=
+        num_labels=len(id2label),
+        id2label=id2label,
+        label2id={label: id_ for id_, label in id2label.items()},
         cache_dir=model_config.model_cache_dir,
         trust_remote_code=benchmark_config.trust_remote_code,
     )
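For context on the new special case in `load_model_and_tokenizer`: for multiple-choice classification the encoder head is configured with a fixed binary {0: "0", 1: "1"} mapping rather than the dataset's own labels, since the labels seen during finetuning differ from those used at test time. A standalone sketch of just that mapping logic; the helper name and the plain-dict stand-in for `dataset_config` are illustrative only:

    # Hypothetical helper mirroring the id2label branch added in the diff above.
    def build_label_mappings(
        task_group: str, dataset_id2label: dict[int, str]
    ) -> tuple[int, dict[int, str], dict[str, int]]:
        if task_group == "multiple-choice-classification":
            # Fixed binary head used during finetuning, regardless of dataset labels.
            id2label = {0: "0", 1: "1"}
        else:
            id2label = dataset_id2label
        label2id = {label: id_ for id_, label in id2label.items()}
        return len(id2label), id2label, label2id


    num_labels, id2label, label2id = build_label_mappings(
        task_group="multiple-choice-classification",
        dataset_id2label={0: "a", 1: "b", 2: "c", 3: "d"},
    )
    assert (num_labels, label2id) == (2, {"0": 0, "1": 1})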
euroeval/benchmark_modules/hf.py
CHANGED
@@ -24,7 +24,6 @@ from huggingface_hub.hf_api import ModelInfo as HfApiModelInfo
 from peft import PeftConfig
 from requests.exceptions import RequestException
 from torch import nn
-from transformers.configuration_utils import PretrainedConfig
 from transformers.data.data_collator import (
     DataCollatorForTokenClassification,
     DataCollatorWithPadding,
@@ -33,8 +32,6 @@ from transformers.modelcard import TASK_MAPPING
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.tokenization_utils_base import BatchEncoding
 from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError
 
@@ -45,7 +42,7 @@ from ..constants import (
     MAX_CONTEXT_LENGTH,
     MERGE_TAGS,
 )
-from ..data_models import
+from ..data_models import HFModelInfo, ModelConfig
 from ..enums import (
     BatchingPreference,
     GenerativeType,
@@ -67,7 +64,6 @@ from ..task_group_utils (
     token_classification,
 )
 from ..tokenization_utils import get_bos_token, get_eos_token
-from ..types import ExtractLabelsFunction
 from ..utils import (
     block_terminal_output,
     create_model_cache_dir,
@@ -77,6 +73,14 @@ from ..utils import (
 )
 from .base import BenchmarkModule
 
+if t.TYPE_CHECKING:
+    from transformers.configuration_utils import PretrainedConfig
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.tokenization_utils_base import BatchEncoding
+
+    from ..data_models import BenchmarkConfig, DatasetConfig, Task
+    from ..types import ExtractLabelsFunction
+
 logger = logging.getLogger("euroeval")
 
 
@@ -89,9 +93,9 @@ class HuggingFaceEncoderModel(BenchmarkModule):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        dataset_config: DatasetConfig,
-        benchmark_config: BenchmarkConfig,
+        model_config: "ModelConfig",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
     ) -> None:
         """Initialise the model.
 
@@ -108,8 +112,8 @@ class HuggingFaceEncoderModel(BenchmarkModule):
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
         )
-        self._model: PreTrainedModel = model
-        self._tokenizer: PreTrainedTokenizer = tokenizer
+        self._model: "PreTrainedModel" = model
+        self._tokenizer: "PreTrainedTokenizer" = tokenizer
 
         self._model, self._tokenizer = align_model_and_tokenizer(
             model=self._model,
@@ -291,7 +295,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         return None
 
     @property
-    def extract_labels_from_generation(self) -> ExtractLabelsFunction:
+    def extract_labels_from_generation(self) -> "ExtractLabelsFunction":
         """The function used to extract the labels from the generated output.
 
         Returns:
@@ -328,7 +332,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         )
 
     def prepare_dataset(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
+        self, dataset: DatasetDict, task: "Task", itr_idx: int
     ) -> DatasetDict:
         """Prepare the dataset for the model.
 
@@ -361,7 +365,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
             )
             return examples
 
-        def tokenise(examples: dict) -> BatchEncoding:
+        def tokenise(examples: dict) -> "BatchEncoding":
            return self._tokenizer(text=examples["text"], truncation=True, padding=True)
 
         match task.task_group:
@@ -481,7 +485,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
 
     @classmethod
     def model_exists(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
    ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
         """Check if a model exists.
 
@@ -508,8 +512,8 @@ class HuggingFaceEncoderModel(BenchmarkModule):
 
     @classmethod
     def get_model_config(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
-    ) -> ModelConfig:
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
+    ) -> "ModelConfig":
         """Fetch the model configuration.
 
         Args:
@@ -556,10 +560,10 @@ class HuggingFaceEncoderModel(BenchmarkModule):
 
 
 def load_model_and_tokenizer(
-    model_config: ModelConfig,
-    dataset_config: DatasetConfig,
-    benchmark_config: BenchmarkConfig,
-) -> tuple[PreTrainedModel, PreTrainedTokenizer]:
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+) -> tuple["PreTrainedModel", "PreTrainedTokenizer"]:
     """Load the model and tokenizer.
 
     Args:
@@ -618,7 +622,7 @@ def load_model_and_tokenizer(
     # These are used when a timeout occurs
     attempts_left = 5
 
-    model: PreTrainedModel | None = None
+    model: "PreTrainedModel | None" = None
     while True:
         # Get the model class associated with the task group
         model_cls_or_none: t.Type["PreTrainedModel"] | None = get_class_by_name(
@@ -703,8 +707,8 @@ def load_model_and_tokenizer(
 
 
 def get_model_repo_info(
-    model_id: str, revision: str, benchmark_config: BenchmarkConfig
-) -> HFModelInfo | None:
+    model_id: str, revision: str, benchmark_config: "BenchmarkConfig"
+) -> "HFModelInfo | None":
     """Get the information about the model from the HF Hub or a local directory.
 
     Args:
euroeval/benchmark_modules/litellm.py
CHANGED

@@ -11,7 +11,6 @@ from time import sleep
 
 import litellm
 import ollama
-from datasets import DatasetDict
 from huggingface_hub import HfApi
 from huggingface_hub.errors import (
     HFValidationError,
@@ -31,12 +30,11 @@ from litellm.exceptions import (
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
-from litellm.types.utils import ChoiceLogprobs
+from litellm.types.utils import ChoiceLogprobs
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
 from tqdm.auto import tqdm
-from transformers.trainer import Trainer
 
 from ..constants import MAX_LOGPROBS, REASONING_MAX_TOKENS, TASKS_USING_JSON
 from ..data_models import (
@@ -78,6 +76,11 @@ from ..utils import (
 from .base import BenchmarkModule
 from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer
 
+if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+    from litellm.types.utils import ModelResponse
+    from transformers.trainer import Trainer
+
 logger = logging.getLogger("euroeval")
 
 
@@ -140,18 +143,15 @@ NUM_PARAMS_MAPPING = {
 
 ALLOWED_PARAMS = {
     # OpenAI models
-    r"
-    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "high"],
+    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "medium", "high"],
     # Anthropic models
-    r"(anthropic/)?claude-3-
-    r"(anthropic/)?claude-
-    r"(anthropic/)?claude-3-7-sonnet.*": ["thinking"],
+    r"(anthropic/)?claude-3-7-sonnet.*": ["no-thinking", "thinking"],
+    r"(anthropic/)?claude-(sonnet|opus)-4.*": ["no-thinking", "thinking"],
     # Gemini models
-    r"(gemini/)?gemini
+    r"(gemini/)?gemini-2.5-flash-lite.*": ["no-thinking", "thinking"],
+    r"(gemini/)?gemini-2.5-flash-[0-9].*": ["no-thinking", "thinking"],
     # xAI models
-    r"(xai/)?grok-
-    r"(xai/)?grok-3(-fast)?(-beta)?": [],
-    r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "high"],
+    r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "medium", "high"],
 }
 
 
@@ -170,18 +170,6 @@ class LiteLLMModel(BenchmarkModule):
     batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = False
 
-    _handleable_exceptions = (
-        BadRequestError,
-        RateLimitError,
-        APIError,
-        APIConnectionError,
-        Timeout,
-        ServiceUnavailableError,
-        InternalServerError,
-        SystemError,
-        AuthenticationError,
-    )
-
     def __init__(
         self,
         model_config: ModelConfig,
@@ -240,6 +228,8 @@ class LiteLLMModel(BenchmarkModule):
             )
         elif self.model_config.revision in {"thinking"}:
             type_ = GenerativeType.REASONING
+        elif self.model_config.revision in {"no-thinking"}:
+            type_ = GenerativeType.INSTRUCTION_TUNED
         elif re.fullmatch(
             pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
         ):
@@ -370,7 +360,13 @@ class LiteLLMModel(BenchmarkModule):
                 f"Enabling thinking mode for model {self.model_config.model_id!r}",
                 level=logging.DEBUG,
             )
-        elif self.model_config.revision
+        elif self.model_config.revision == "no-thinking":
+            generation_kwargs["thinking"] = dict(type="disabled", budget_tokens=0)
+            log_once(
+                f"Disabling thinking mode for model {self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+        elif self.model_config.revision in {"low", "medium", "high"}:
             generation_kwargs["reasoning_effort"] = self.model_config.revision
             log_once(
                 f"Enabling reasoning effort {self.model_config.revision!r} for model "
@@ -381,7 +377,7 @@
         # Drop generation kwargs that are not supported by the model
         litellm.drop_params = True
 
-        all_responses: dict[int, ModelResponse] = {}
+        all_responses: dict[int, "ModelResponse"] = {}
         conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
             enumerate(conversations)
         )
@@ -477,6 +473,10 @@ class LiteLLMModel(BenchmarkModule):
         ]
         max_items_messages = ["'maxItems' is not permitted."]
         no_json_schema_messages = ["Property keys should match pattern"]
+        thinking_budget_pattern = re.compile(
+            r"the thinking budget [0-9]+ is invalid. please choose a value between "
+            r"[0-9]+ and ([0-9]+)\."
+        )
 
         if any(msg.lower() in error_msg for msg in stop_messages):
             log_once(
@@ -537,6 +537,26 @@ class LiteLLMModel(BenchmarkModule):
             )
             generation_kwargs["response_format"] = dict(type="json_object")
             return
+        elif thinking_match := thinking_budget_pattern.search(string=error_msg):
+            thinking_budget = int(thinking_match.group(1))
+            if thinking_budget >= REASONING_MAX_TOKENS:
+                raise InvalidBenchmark(
+                    f"The model {model_id!r} has an upper thinking budget of "
+                    f"{thinking_budget:,} tokens, which is within the limit of "
+                    f"{REASONING_MAX_TOKENS:,} tokens. This should not happen. The "
+                    f"error message was: {error_msg}."
+                )
+            log_once(
+                f"The model {model_id!r} can at most use {thinking_budget:,} tokens "
+                "for reasoning, which is less than the default of "
+                f"{REASONING_MAX_TOKENS:,} tokens. Setting the thinking budget to "
+                f"{thinking_budget:,} tokens.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["thinking"] = dict(
+                type="enabled", budget_tokens=thinking_budget - 1
+            )
+            return
         elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
@@ -581,7 +601,7 @@ class LiteLLMModel(BenchmarkModule):
         model_id: str,
         conversations: list[list[litellm.AllMessageValues]],
         **generation_kwargs,
-    ) -> tuple[list[tuple[int, ModelResponse]], list[tuple[int, Exception]]]:
+    ) -> tuple[list[tuple[int, "ModelResponse"]], list[tuple[int, Exception]]]:
         """Generate outputs from the model asynchronously.
 
         Args:
@@ -641,7 +661,7 @@ class LiteLLMModel(BenchmarkModule):
 
     @staticmethod
     def _create_model_output(
-        model_responses: list[ModelResponse], model_id: str
+        model_responses: list["ModelResponse"], model_id: str
    ) -> GenerativeModelOutput:
         """Create a GenerativeModelOutput object from a list of ModelResponse objects.
 
@@ -1123,8 +1143,8 @@ class LiteLLMModel(BenchmarkModule):
         )
 
     def prepare_dataset(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> DatasetDict:
+        self, dataset: "DatasetDict", task: Task, itr_idx: int
+    ) -> "DatasetDict":
         """Prepare the dataset for the model.
 
         This includes things like tokenisation.
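The new error-handling branch above recovers the provider's maximum thinking budget from the error message and retries with a budget just below it. A standalone sketch of only the regex extraction; the error string below is invented for illustration:

    import re

    # Same pattern as in the diff above.
    thinking_budget_pattern = re.compile(
        r"the thinking budget [0-9]+ is invalid. please choose a value between "
        r"[0-9]+ and ([0-9]+)\."
    )

    # Made-up example of the provider error this is meant to parse.
    error_msg = (
        "the thinking budget 10000 is invalid. please choose a value between "
        "1024 and 8192."
    )

    if match := thinking_budget_pattern.search(error_msg):
        # The provider's reported maximum; the handler above then retries with
        # budget_tokens set to this value minus one.
        thinking_budget = int(match.group(1))
        print(thinking_budget)  # prints 8192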