EuroEval 15.10.0__py3-none-any.whl → 15.11.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

This version of EuroEval has been flagged as a potentially problematic release.
euroeval/__init__.py CHANGED

@@ -86,6 +86,13 @@ os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"


+# Avoid the "Unclosed client session" error when evaluating Ollama models with LiteLLM.
+# The error comes from the `aiohttp` package, and this environment variable forces the
+# use of `httpx` instead.
+# Link: https://github.com/BerriAI/litellm/issues/11657#issuecomment-3038984975
+os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
+
+
 # Use older version v0 of vLLM, as the newer one requires XGrammar as decoding backend,
 # but XGrammar does not support having a maximal amount of elements in lists
 os.environ["VLLM_USE_V1"] = "0"
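The same workaround can be reproduced outside EuroEval: the variable has to be set before LiteLLM issues any requests so that it picks its httpx transport instead of aiohttp. A minimal sketch, assuming an installed LiteLLM version that honours the variable (per the linked issue); the Ollama model id is illustrative:

    # Force LiteLLM onto its httpx transport to avoid the "Unclosed client
    # session" error from aiohttp when talking to Ollama.
    import os

    os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"

    import litellm  # imported only after the variable is set

    response = litellm.completion(
        model="ollama/llama3",  # illustrative Ollama model id
        messages=[{"role": "user", "content": "Hello!"}],
    )
    print(response.choices[0].message.content)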
@@ -10,17 +10,8 @@ from functools import cached_property, partial
 from datasets import DatasetDict
 from torch import nn
 from tqdm.auto import tqdm
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.trainer import Trainer
-
-from ..data_models import (
-    BenchmarkConfig,
-    DatasetConfig,
-    GenerativeModelOutput,
-    ModelConfig,
-    Task,
-)
-from ..enums import BatchingPreference, GenerativeType, TaskGroup
+
+from ..enums import TaskGroup
 from ..exceptions import NeedsEnvironmentVariable, NeedsExtraInstalled
 from ..task_group_utils import (
     question_answering,
@@ -28,9 +19,22 @@ from ..task_group_utils import (
     text_to_text,
     token_classification,
 )
-from ..types import ComputeMetricsFunction, ExtractLabelsFunction
 from ..utils import log_once

+if t.TYPE_CHECKING:
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.trainer import Trainer
+
+    from ..data_models import (
+        BenchmarkConfig,
+        DatasetConfig,
+        GenerativeModelOutput,
+        ModelConfig,
+        Task,
+    )
+    from ..enums import BatchingPreference, GenerativeType
+    from ..types import ComputeMetricsFunction, ExtractLabelsFunction
+
 logger = logging.getLogger("euroeval")


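The restructuring above is the standard typing.TYPE_CHECKING pattern: imports that are only needed for annotations are deferred to type-checking time, and the annotations that use them are quoted so they are never evaluated at runtime. A minimal sketch of the pattern with one of the deferred imports (the function itself is illustrative, not part of the package):

    import typing as t

    if t.TYPE_CHECKING:
        # Only evaluated by static type checkers; no runtime import cost.
        from transformers.trainer import Trainer


    def run_training(trainer: "Trainer") -> None:
        """Run a training loop; the quoted annotation is resolved lazily."""
        trainer.train()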
@@ -49,14 +53,14 @@ class BenchmarkModule(ABC):
     """

     fresh_model: bool
-    batching_preference: BatchingPreference
+    batching_preference: "BatchingPreference"
     high_priority: bool

     def __init__(
         self,
-        model_config: ModelConfig,
-        dataset_config: DatasetConfig,
-        benchmark_config: BenchmarkConfig,
+        model_config: "ModelConfig",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
     ) -> None:
         """Initialise the benchmark module.

@@ -138,7 +142,7 @@ class BenchmarkModule(ABC):

     @property
     @abstractmethod
-    def generative_type(self) -> GenerativeType | None:
+    def generative_type(self) -> "GenerativeType | None":
         """Get the generative type of the model.

         Returns:
@@ -177,7 +181,7 @@ class BenchmarkModule(ABC):
         ...

     @property
-    def compute_metrics(self) -> ComputeMetricsFunction:
+    def compute_metrics(self) -> "ComputeMetricsFunction":
         """The function used to compute the metrics.

         Returns:
@@ -188,13 +192,11 @@ class BenchmarkModule(ABC):
                 return partial(
                     sequence_classification.compute_metrics,
                     dataset_config=self.dataset_config,
-                    benchmark_config=self.benchmark_config,
                 )
             case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
                 return partial(
                     sequence_classification.compute_metrics,
                     dataset_config=self.dataset_config,
-                    benchmark_config=self.benchmark_config,
                 )
             case TaskGroup.TEXT_TO_TEXT:
                 return partial(
@@ -207,13 +209,11 @@ class BenchmarkModule(ABC):
                     token_classification.compute_metrics,
                     has_misc_tags=self.buffer.get("has_misc_tags", True),
                     dataset_config=self.dataset_config,
-                    benchmark_config=self.benchmark_config,
                 )
             case TaskGroup.QUESTION_ANSWERING:
                 return partial(
                     question_answering.compute_metrics,
                     dataset_config=self.dataset_config,
-                    benchmark_config=self.benchmark_config,
                 )
             case _:
                 raise NotImplementedError(
@@ -222,7 +222,7 @@ class BenchmarkModule(ABC):

     @property
     @abstractmethod
-    def extract_labels_from_generation(self) -> ExtractLabelsFunction:
+    def extract_labels_from_generation(self) -> "ExtractLabelsFunction":
         """The function used to extract the labels from the generated output.

         Returns:
@@ -241,7 +241,7 @@ class BenchmarkModule(ABC):
         ...

     def prepare_datasets(
-        self, datasets: list[DatasetDict], task: Task
+        self, datasets: list[DatasetDict], task: "Task"
     ) -> list[DatasetDict]:
         """Prepare the datasets for the model.

@@ -283,7 +283,7 @@ class BenchmarkModule(ABC):

     @abstractmethod
     def prepare_dataset(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
+        self, dataset: DatasetDict, task: "Task", itr_idx: int
     ) -> DatasetDict:
         """Prepare the dataset for the model.

@@ -302,7 +302,7 @@ class BenchmarkModule(ABC):
         """
         ...

-    def generate(self, inputs: dict) -> GenerativeModelOutput:
+    def generate(self, inputs: dict) -> "GenerativeModelOutput":
         """Generate outputs from the model.

         Args:
@@ -320,7 +320,7 @@ class BenchmarkModule(ABC):
     @classmethod
     @abstractmethod
     def model_exists(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
     ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
         """Check if a model exists.

@@ -339,8 +339,8 @@ class BenchmarkModule(ABC):
     @classmethod
     @abstractmethod
     def get_model_config(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
-    ) -> ModelConfig:
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
+    ) -> "ModelConfig":
         """Fetch the model configuration.

         Args:
@@ -1,11 +1,10 @@
 """Freshly initialised encoder models."""

 import os
+import typing as t
 from functools import cached_property
 from json import JSONDecodeError

-from transformers.configuration_utils import PretrainedConfig
-from transformers.modeling_utils import PreTrainedModel
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 from transformers.models.electra import (
@@ -18,9 +17,8 @@ from transformers.models.xlm_roberta import (
     XLMRobertaForSequenceClassification,
     XLMRobertaForTokenClassification,
 )
-from transformers.tokenization_utils import PreTrainedTokenizer

-from ..data_models import BenchmarkConfig, DatasetConfig, ModelConfig
+from ..data_models import ModelConfig
 from ..enums import InferenceBackend, ModelType, TaskGroup
 from ..exceptions import (
     InvalidBenchmark,
@@ -35,6 +33,13 @@ from .hf import (
     setup_model_for_question_answering,
 )

+if t.TYPE_CHECKING:
+    from transformers.configuration_utils import PretrainedConfig
+    from transformers.modeling_utils import PreTrainedModel
+    from transformers.tokenization_utils import PreTrainedTokenizer
+
+    from ..data_models import BenchmarkConfig, DatasetConfig
+

 class FreshEncoderModel(HuggingFaceEncoderModel):
     """A freshly initialised encoder model."""
@@ -43,9 +48,9 @@ class FreshEncoderModel(HuggingFaceEncoderModel):

     def __init__(
         self,
-        model_config: ModelConfig,
-        dataset_config: DatasetConfig,
-        benchmark_config: BenchmarkConfig,
+        model_config: "ModelConfig",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
     ) -> None:
         """Initialise the model.

@@ -67,8 +72,8 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
             benchmark_config=benchmark_config,
             model_max_length=self.model_max_length,
         )
-        self._model: PreTrainedModel = model
-        self._tokenizer: PreTrainedTokenizer = tokenizer
+        self._model: "PreTrainedModel" = model
+        self._tokenizer: "PreTrainedTokenizer" = tokenizer

         self._model, self._tokenizer = align_model_and_tokenizer(
             model=self._model,
@@ -141,7 +146,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):

     @classmethod
     def model_exists(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
     ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
         """Check if a model exists.

@@ -160,8 +165,8 @@ class FreshEncoderModel(HuggingFaceEncoderModel):

     @classmethod
     def get_model_config(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
-    ) -> ModelConfig:
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
+    ) -> "ModelConfig":
         """Fetch the model configuration.

         Args:
@@ -190,11 +195,11 @@ class FreshEncoderModel(HuggingFaceEncoderModel):


 def load_model_and_tokenizer(
-    model_config: ModelConfig,
-    dataset_config: DatasetConfig,
-    benchmark_config: BenchmarkConfig,
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     model_max_length: int,
-) -> tuple[PreTrainedModel, PreTrainedTokenizer]:
+) -> "tuple[PreTrainedModel, PreTrainedTokenizer]":
     """Load the model and tokenizer.

     Args:
@@ -248,12 +253,19 @@ def load_model_and_tokenizer(
     )
     model_cls = model_cls_mapping[model_id]

+    # Special case where there is a mismatch between the labels during training and
+    # testing
+    if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+        id2label = {0: "0", 1: "1"}
+    else:
+        id2label = dataset_config.id2label
+
     config = AutoConfig.from_pretrained(
         real_model_id,
         token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
-        num_labels=dataset_config.num_labels,
-        id2label=dataset_config.id2label,
-        label2id=dataset_config.label2id,
+        num_labels=len(id2label),
+        id2label=id2label,
+        label2id={label: id_ for id_, label in id2label.items()},
         cache_dir=model_config.model_cache_dir,
         trust_remote_code=benchmark_config.trust_remote_code,
     )
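The new branch pins the label space to a binary {0: "0", 1: "1"} mapping for multiple-choice classification and derives num_labels and label2id from it, rather than taking them from the dataset config. A standalone sketch of the same derivation (the checkpoint id is illustrative):

    from transformers import AutoConfig

    id2label = {0: "0", 1: "1"}  # binary labels used at evaluation time
    config = AutoConfig.from_pretrained(
        "xlm-roberta-base",  # illustrative encoder checkpoint
        num_labels=len(id2label),
        id2label=id2label,
        label2id={label: id_ for id_, label in id2label.items()},
    )
    print(config.num_labels, config.label2id)  # 2 {'0': 0, '1': 1}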
@@ -24,7 +24,6 @@ from huggingface_hub.hf_api import ModelInfo as HfApiModelInfo
 from peft import PeftConfig
 from requests.exceptions import RequestException
 from torch import nn
-from transformers.configuration_utils import PretrainedConfig
 from transformers.data.data_collator import (
     DataCollatorForTokenClassification,
     DataCollatorWithPadding,
@@ -33,8 +32,6 @@ from transformers.modelcard import TASK_MAPPING
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.tokenization_utils_base import BatchEncoding
 from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError

@@ -45,7 +42,7 @@ from ..constants import (
     MAX_CONTEXT_LENGTH,
     MERGE_TAGS,
 )
-from ..data_models import BenchmarkConfig, DatasetConfig, HFModelInfo, ModelConfig, Task
+from ..data_models import HFModelInfo, ModelConfig
 from ..enums import (
     BatchingPreference,
     GenerativeType,
@@ -67,7 +64,6 @@ from ..task_group_utils import (
     token_classification,
 )
 from ..tokenization_utils import get_bos_token, get_eos_token
-from ..types import ExtractLabelsFunction
 from ..utils import (
     block_terminal_output,
     create_model_cache_dir,
@@ -77,6 +73,14 @@ from ..utils import (
 )
 from .base import BenchmarkModule

+if t.TYPE_CHECKING:
+    from transformers.configuration_utils import PretrainedConfig
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.tokenization_utils_base import BatchEncoding
+
+    from ..data_models import BenchmarkConfig, DatasetConfig, Task
+    from ..types import ExtractLabelsFunction
+
 logger = logging.getLogger("euroeval")


@@ -89,9 +93,9 @@ class HuggingFaceEncoderModel(BenchmarkModule):

     def __init__(
         self,
-        model_config: ModelConfig,
-        dataset_config: DatasetConfig,
-        benchmark_config: BenchmarkConfig,
+        model_config: "ModelConfig",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
     ) -> None:
         """Initialise the model.

@@ -108,8 +112,8 @@ class HuggingFaceEncoderModel(BenchmarkModule):
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
         )
-        self._model: PreTrainedModel = model
-        self._tokenizer: PreTrainedTokenizer = tokenizer
+        self._model: "PreTrainedModel" = model
+        self._tokenizer: "PreTrainedTokenizer" = tokenizer

         self._model, self._tokenizer = align_model_and_tokenizer(
             model=self._model,
@@ -291,7 +295,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         return None

     @property
-    def extract_labels_from_generation(self) -> ExtractLabelsFunction:
+    def extract_labels_from_generation(self) -> "ExtractLabelsFunction":
         """The function used to extract the labels from the generated output.

         Returns:
@@ -328,7 +332,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         )

     def prepare_dataset(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
+        self, dataset: DatasetDict, task: "Task", itr_idx: int
     ) -> DatasetDict:
         """Prepare the dataset for the model.

@@ -361,7 +365,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
             )
             return examples

-        def tokenise(examples: dict) -> BatchEncoding:
+        def tokenise(examples: dict) -> "BatchEncoding":
             return self._tokenizer(text=examples["text"], truncation=True, padding=True)

         match task.task_group:
@@ -481,7 +485,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):

     @classmethod
     def model_exists(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
     ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
         """Check if a model exists.

@@ -508,8 +512,8 @@ class HuggingFaceEncoderModel(BenchmarkModule):

     @classmethod
     def get_model_config(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
-    ) -> ModelConfig:
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
+    ) -> "ModelConfig":
         """Fetch the model configuration.

         Args:
@@ -556,10 +560,10 @@ class HuggingFaceEncoderModel(BenchmarkModule):


 def load_model_and_tokenizer(
-    model_config: ModelConfig,
-    dataset_config: DatasetConfig,
-    benchmark_config: BenchmarkConfig,
-) -> tuple[PreTrainedModel, PreTrainedTokenizer]:
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+) -> tuple["PreTrainedModel", "PreTrainedTokenizer"]:
     """Load the model and tokenizer.

     Args:
@@ -618,7 +622,7 @@ def load_model_and_tokenizer(
     # These are used when a timeout occurs
     attempts_left = 5

-    model: PreTrainedModel | None = None
+    model: "PreTrainedModel | None" = None
     while True:
         # Get the model class associated with the task group
         model_cls_or_none: t.Type["PreTrainedModel"] | None = get_class_by_name(
@@ -703,8 +707,8 @@ def load_model_and_tokenizer(


 def get_model_repo_info(
-    model_id: str, revision: str, benchmark_config: BenchmarkConfig
-) -> HFModelInfo | None:
+    model_id: str, revision: str, benchmark_config: "BenchmarkConfig"
+) -> "HFModelInfo | None":
     """Get the information about the model from the HF Hub or a local directory.

     Args:
@@ -11,7 +11,6 @@ from time import sleep

 import litellm
 import ollama
-from datasets import DatasetDict
 from huggingface_hub import HfApi
 from huggingface_hub.errors import (
     HFValidationError,
@@ -31,12 +30,11 @@ from litellm.exceptions import (
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
-from litellm.types.utils import ChoiceLogprobs, ModelResponse
+from litellm.types.utils import ChoiceLogprobs
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
 from tqdm.auto import tqdm
-from transformers.trainer import Trainer

 from ..constants import MAX_LOGPROBS, REASONING_MAX_TOKENS, TASKS_USING_JSON
 from ..data_models import (
@@ -78,6 +76,11 @@ from ..utils import (
 )
 from .base import BenchmarkModule
 from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer
+if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+    from litellm.types.utils import ModelResponse
+    from transformers.trainer import Trainer
+
 logger = logging.getLogger("euroeval")


@@ -140,18 +143,15 @@ NUM_PARAMS_MAPPING = {

 ALLOWED_PARAMS = {
     # OpenAI models
-    r"gpt-4.*": [],
-    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "high"],
+    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "medium", "high"],
     # Anthropic models
-    r"(anthropic/)?claude-3-(haiku|sonnet|opus).*": [],
-    r"(anthropic/)?claude-3-5-.*": [],
-    r"(anthropic/)?claude-3-7-sonnet.*": ["thinking"],
+    r"(anthropic/)?claude-3-7-sonnet.*": ["no-thinking", "thinking"],
+    r"(anthropic/)?claude-(sonnet|opus)-4.*": ["no-thinking", "thinking"],
     # Gemini models
-    r"(gemini/)?gemini-.*": [],
+    r"(gemini/)?gemini-2.5-flash-lite.*": ["no-thinking", "thinking"],
+    r"(gemini/)?gemini-2.5-flash-[0-9].*": ["no-thinking", "thinking"],
     # xAI models
-    r"(xai/)?grok-2.*": [],
-    r"(xai/)?grok-3(-fast)?(-beta)?": [],
-    r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "high"],
+    r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "medium", "high"],
 }


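The ALLOWED_PARAMS table maps model-id regexes to the extra parameters (reasoning-effort levels or thinking switches) that may accompany them. A rough sketch of how such a table can be queried; the helper name is hypothetical and not EuroEval's actual implementation:

    import re

    ALLOWED_PARAMS = {
        r"(anthropic/)?claude-3-7-sonnet.*": ["no-thinking", "thinking"],
        r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "medium", "high"],
    }


    def param_allowed(model_id: str, param: str) -> bool:
        """Check whether `param` may be used with `model_id` (hypothetical helper)."""
        return any(
            re.fullmatch(pattern, model_id) and param in allowed
            for pattern, allowed in ALLOWED_PARAMS.items()
        )


    print(param_allowed("anthropic/claude-3-7-sonnet-20250219", "thinking"))  # True
    print(param_allowed("xai/grok-3-mini", "medium"))  # True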
@@ -170,18 +170,6 @@ class LiteLLMModel(BenchmarkModule):
     batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = False

-    _handleable_exceptions = (
-        BadRequestError,
-        RateLimitError,
-        APIError,
-        APIConnectionError,
-        Timeout,
-        ServiceUnavailableError,
-        InternalServerError,
-        SystemError,
-        AuthenticationError,
-    )
-
     def __init__(
         self,
         model_config: ModelConfig,
@@ -240,6 +228,8 @@ class LiteLLMModel(BenchmarkModule):
             )
         elif self.model_config.revision in {"thinking"}:
             type_ = GenerativeType.REASONING
+        elif self.model_config.revision in {"no-thinking"}:
+            type_ = GenerativeType.INSTRUCTION_TUNED
         elif re.fullmatch(
             pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
         ):
@@ -370,7 +360,13 @@ class LiteLLMModel(BenchmarkModule):
                 f"Enabling thinking mode for model {self.model_config.model_id!r}",
                 level=logging.DEBUG,
             )
-        elif self.model_config.revision in {"low", "high"}:
+        elif self.model_config.revision == "no-thinking":
+            generation_kwargs["thinking"] = dict(type="disabled", budget_tokens=0)
+            log_once(
+                f"Disabling thinking mode for model {self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+        elif self.model_config.revision in {"low", "medium", "high"}:
             generation_kwargs["reasoning_effort"] = self.model_config.revision
             log_once(
                 f"Enabling reasoning effort {self.model_config.revision!r} for model "
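For the new "no-thinking" revision the module passes an explicit thinking-disabled payload to LiteLLM, mirroring the thinking-enabled branch. A rough sketch of the request this ends up making; the model id and prompt are illustrative, and litellm.drop_params is assumed to discard the kwarg on providers that do not support it:

    import litellm

    litellm.drop_params = True  # drop unsupported kwargs, as the module does

    response = litellm.completion(
        model="anthropic/claude-3-7-sonnet-latest",  # illustrative model id
        messages=[{"role": "user", "content": "Summarise this sentence."}],
        thinking={"type": "disabled", "budget_tokens": 0},  # same kwargs as the branch above
        max_tokens=256,
    )
    print(response.choices[0].message.content)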
@@ -381,7 +377,7 @@ class LiteLLMModel(BenchmarkModule):
         # Drop generation kwargs that are not supported by the model
         litellm.drop_params = True

-        all_responses: dict[int, ModelResponse] = {}
+        all_responses: dict[int, "ModelResponse"] = {}
         conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
             enumerate(conversations)
         )
@@ -477,6 +473,10 @@ class LiteLLMModel(BenchmarkModule):
         ]
         max_items_messages = ["'maxItems' is not permitted."]
         no_json_schema_messages = ["Property keys should match pattern"]
+        thinking_budget_pattern = re.compile(
+            r"the thinking budget [0-9]+ is invalid. please choose a value between "
+            r"[0-9]+ and ([0-9]+)\."
+        )

         if any(msg.lower() in error_msg for msg in stop_messages):
             log_once(
@@ -537,6 +537,26 @@ class LiteLLMModel(BenchmarkModule):
             )
             generation_kwargs["response_format"] = dict(type="json_object")
             return
+        elif thinking_match := thinking_budget_pattern.search(string=error_msg):
+            thinking_budget = int(thinking_match.group(1))
+            if thinking_budget >= REASONING_MAX_TOKENS:
+                raise InvalidBenchmark(
+                    f"The model {model_id!r} has an upper thinking budget of "
+                    f"{thinking_budget:,} tokens, which is within the limit of "
+                    f"{REASONING_MAX_TOKENS:,} tokens. This should not happen. The "
+                    f"error message was: {error_msg}."
+                )
+            log_once(
+                f"The model {model_id!r} can at most use {thinking_budget:,} tokens "
+                "for reasoning, which is less than the default of "
+                f"{REASONING_MAX_TOKENS:,} tokens. Setting the thinking budget to "
+                f"{thinking_budget:,} tokens.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["thinking"] = dict(
+                type="enabled", budget_tokens=thinking_budget - 1
+            )
+            return
         elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
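The new branch recovers the provider's maximum thinking budget from the error message itself and retries just below that cap. The regex and the clamping can be exercised in isolation; the error message below is a made-up example in the expected format:

    import re

    thinking_budget_pattern = re.compile(
        r"the thinking budget [0-9]+ is invalid. please choose a value between "
        r"[0-9]+ and ([0-9]+)\."
    )

    # Hypothetical, lower-cased provider error in the expected format.
    error_msg = "the thinking budget 32768 is invalid. please choose a value between 1024 and 24576."

    if match := thinking_budget_pattern.search(error_msg):
        max_budget = int(match.group(1))
        # Retry one token below the provider's cap, as the handler above does.
        generation_kwargs = {"thinking": {"type": "enabled", "budget_tokens": max_budget - 1}}
        print(generation_kwargs["thinking"])  # {'type': 'enabled', 'budget_tokens': 24575}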
@@ -581,7 +601,7 @@ class LiteLLMModel(BenchmarkModule):
         model_id: str,
         conversations: list[list[litellm.AllMessageValues]],
         **generation_kwargs,
-    ) -> tuple[list[tuple[int, ModelResponse]], list[tuple[int, Exception]]]:
+    ) -> tuple[list[tuple[int, "ModelResponse"]], list[tuple[int, Exception]]]:
         """Generate outputs from the model asynchronously.

         Args:
@@ -641,7 +661,7 @@ class LiteLLMModel(BenchmarkModule):

     @staticmethod
     def _create_model_output(
-        model_responses: list[ModelResponse], model_id: str
+        model_responses: list["ModelResponse"], model_id: str
     ) -> GenerativeModelOutput:
         """Create a GenerativeModelOutput object from a list of ModelResponse objects.

@@ -1123,8 +1143,8 @@ class LiteLLMModel(BenchmarkModule):
         )

     def prepare_dataset(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> DatasetDict:
+        self, dataset: "DatasetDict", task: Task, itr_idx: int
+    ) -> "DatasetDict":
         """Prepare the dataset for the model.

         This includes things like tokenisation.